{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import copy\n",
    "import warnings\n",
    "from tools.data_handling import make_OR_plot, make_boxplot, make_lm_plot\n",
    "from tools.file_utilities import make_folder_if_not_exists\n",
    "import statsmodels.api as sm\n",
    "from statsmodels.formula.api import logit\n",
    "import statsmodels.stats.multitest as smm\n",
    "import statsmodels.formula.api as smf\n",
    "import scipy.stats as stats\n",
    "import scipy as sp\n",
    "import qgrid\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "import plotly.io as pio\n",
    "%matplotlib inline\n",
    "sns.set_style(\"whitegrid\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Signatures to analyse: COSMIC or denovo\n",
    "run_name = 'RCC_961_final'\n",
    "analysis = 'COSMIC'\n",
    "# Mutation type to analyse\n",
    "mutation_types = ['SBS', 'DBS', 'ID'] #,'CNV', 'SV'] #\n",
    " \n",
    "SBS_context = 1536\n",
    "if analysis=='COSMIC':\n",
    "    if 'SV' in mutation_types:\n",
    "        mutation_types.remove('SV')\n",
    "    SBS_context = 96\n",
    "\n",
    "adjust_regressions = True # adjust all regressions by sex and age\n",
    "log_transform = False # log transform frequent signatures and burdens\n",
    "show_median = False # plot median instead of mean values\n",
    "show_error_bars = True # show error bars (std) on plots\n",
    "show_r_squared = False # show R squared for linear regressions\n",
    "adjust_TMB = False # adjust TMB for sex, age and stage in plots\n",
    "use_robust_regressions = False # use robust regressions\n",
    "exclude_AA_outliers = False # exclude outlier cases with >10% of SBS22a or SBS22b (or denovo counterparts)\n",
    "exclude_Japanese_outliers = False # exclude outlier cases with >10% of SBS12 (or denovo counterpart)\n",
    "relative_attributions = False # use relative instead of absolute values of attributions\n",
    "drop_AA_countries = False # drop Romania, Serbia and Thailand\n",
    "only_early_stage = False # Limit only to stage I and stage II tumours\n",
    "only_czech_republic = False # only analyse counties within Czechia\n",
    "sex_dependent_ASR = False # use sex-dependent ASR\n",
    "males_only = False # males-only regression\n",
    "females_only = False # females-only regression\n",
    "drop_thailand_in_adjusted_regressions = False # drop Thailand in regressions\n",
    "linear_fit_over_countries = False # only for unadjusted regressions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "run_name += '_' + '_'.join(mutation_types)\n",
    "if log_transform:\n",
    "    run_name += '_log'\n",
    "if show_median:\n",
    "    run_name += '_median'\n",
    "if show_error_bars:\n",
    "    run_name += '_error_bars'\n",
    "if adjust_TMB:\n",
    "    run_name += '_adj_TMB'\n",
    "if relative_attributions:\n",
    "    run_name += '_rel_attr'\n",
    "if use_robust_regressions:\n",
    "    run_name += '_RML'\n",
    "if drop_AA_countries:\n",
    "    run_name += '_noAAcountries'\n",
    "if only_early_stage:\n",
    "    run_name += '_only_early_stage'\n",
    "if only_czech_republic:\n",
    "    run_name += '_only_czech'\n",
    "if linear_fit_over_countries:\n",
    "    run_name += '_linear_fit_over_countries'\n",
    "if sex_dependent_ASR:\n",
    "    run_name += '_sex_dependent_ASR'\n",
    "if males_only:\n",
    "    run_name += '_males_only'\n",
    "if females_only:\n",
    "    run_name += '_females_only'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import the metadata\n",
    "data_merged = pd.read_csv('data_merged.csv',index_col=0)\n",
    "\n",
    "# renaming for brevity\n",
    "data_merged = data_merged.replace('Czech Republic', 'Czechia')\n",
    "data_merged = data_merged.replace('United Kingdom', 'UK')\n",
    "data_merged = data_merged.replace('Ceske Budejovice', 'Ceske B.')\n",
    "\n",
    "if only_czech_republic:\n",
    "    # Only Czechia:\n",
    "    data_merged = data_merged[data_merged.country == 'Czechia']\n",
    "    czech_residency = pd.read_csv('metadata/extra/cz_residency.csv', index_col=0, encoding= 'unicode_escape')\n",
    "    data_merged = pd.concat([data_merged, czech_residency[['county','Incidence']]], axis=1, join='inner')\n",
    "    data_merged = data_merged.replace('Hlavní mesto Praha', 'Prague region')\n",
    "    data_merged = data_merged.replace('Jihoceský kraj', 'Ceske Budejovice region')\n",
    "    data_merged = data_merged.replace('Olomoucký kraj', 'Olomouc region')\n",
    "    data_merged = data_merged.replace('Jihomoravský kraj', 'Brno region')\n",
    "elif drop_AA_countries:\n",
    "    # drop Romania and Serbia\n",
    "    data_merged = data_merged[data_merged.country != 'Romania']\n",
    "    data_merged = data_merged[data_merged.country != 'Serbia']\n",
    "    data_merged = data_merged[data_merged.country != 'Thailand']\n",
    "    \n",
    "if males_only:\n",
    "    data_merged = data_merged[data_merged.sex == 'Male']\n",
    "elif females_only:\n",
    "    data_merged = data_merged[data_merged.sex == 'Female']\n",
    "\n",
    "if only_early_stage:\n",
    "    data_merged = data_merged.query(\"stage == 'I' or stage == 'II'\")\n",
    "\n",
    "# qgrid.show_grid(data_merged, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e58116762d1e4181aa0ef78ea68f17e9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# import signature attributions\n",
    "if only_czech_republic:\n",
    "    merged_sigs_with_metadata = data_merged[['country','county', 'Incidence', 'sex', 'age_diag', 'stage']].copy()\n",
    "else:\n",
    "    merged_sigs_with_metadata = data_merged[['country', 'sex', 'age_diag', 'stage']].copy()\n",
    "\n",
    "# merged_sigs_with_metadata.index = merged_sigs_with_metadata.index.str.replace(r'a', '')\n",
    "for mutation_type in mutation_types:\n",
    "    context = SBS_context if mutation_type=='SBS' else 78 if mutation_type=='DBS' else 83 if mutation_type=='ID' else 32 if mutation_type=='SV' else 48 if mutation_type=='CNV' else 'NA'\n",
    "    mutation_type_context = mutation_type + str(context)\n",
    "    if analysis=='COSMIC':\n",
    "#     v14 COSMIC nominal\n",
    "        sigs_CIs = pd.read_csv('./MSA_attribution/962_Consensus_v3/cosmic_noCI_penalties/SigProfilerExtractor/%s/output_tables/CIs_RCC_Manuscript_COSMIC_%s_bootstrap_output_weights.csv' % (mutation_type_context, mutation_type_context), index_col=0)\n",
    "        sigs_abs = pd.read_csv('./MSA_attribution/962_Consensus_v3/cosmic_noCI_penalties/SigProfilerExtractor/%s/output_tables/output_RCC_Manuscript_COSMIC_%s_mutations_table.csv' % (mutation_type_context, mutation_type_context), index_col=0)\n",
    "        stat_inf = pd.read_csv('./MSA_attribution/962_Consensus_v3/cosmic_noCI_penalties/SigProfilerExtractor/%s/output_tables/output_RCC_Manuscript_COSMIC_%s_stat_info.csv' % (mutation_type_context, mutation_type_context), index_col=0)\n",
    "        sigs_abs = sigs_abs.add_suffix('_abs')\n",
    "    elif analysis=='denovo':\n",
    "        sigs_CIs = pd.read_csv('./MSA_attribution/962_Consensus_v3/denovo_noCI_penalties/SigProfilerExtractor/%s/output_tables/CIs_RCC_Manuscript_denovo_%s_bootstrap_output_weights.csv' % (mutation_type_context, mutation_type_context), index_col=0)\n",
    "        sigs_abs = pd.read_csv('./MSA_attribution/962_Consensus_v3/denovo_noCI_penalties/SigProfilerExtractor/%s/output_tables/output_RCC_Manuscript_denovo_%s_mutations_table.csv' % (mutation_type_context, mutation_type_context), index_col=0)\n",
    "        stat_inf = pd.read_csv('./MSA_attribution/962_Consensus_v3/denovo_noCI_penalties/SigProfilerExtractor/%s/output_tables/output_RCC_Manuscript_denovo_%s_stat_info.csv' % (mutation_type_context, mutation_type_context), index_col=0)\n",
    "        sigs_abs = sigs_abs.add_suffix('_abs')\n",
    "    stat_inf = stat_inf.rename(columns={'Mutational burden': mutation_type + '_burden'})\n",
    "    # merge with metadata\n",
    "    merged_sigs_with_metadata = pd.concat([sigs_CIs, merged_sigs_with_metadata], axis=1)#.fillna(0)\n",
    "    merged_sigs_with_metadata = pd.concat([sigs_abs, merged_sigs_with_metadata], axis=1)#.fillna(0)\n",
    "    merged_sigs_with_metadata = pd.concat([stat_inf[mutation_type + '_burden'], merged_sigs_with_metadata], axis=1)#.fillna(0)\n",
    "#     merged_sigs_with_metadata = pd.merge(sigs_CIs, merged_sigs_with_metadata, left_index=True, right_index=True)\n",
    "#     merged_sigs_with_metadata = pd.merge(sigs_abs, merged_sigs_with_metadata, left_index=True, right_index=True)\n",
    "if drop_AA_countries or only_early_stage or only_czech_republic:\n",
    "    merged_sigs_with_metadata.dropna(subset=['country'], inplace=True)\n",
    "if males_only or females_only:\n",
    "    merged_sigs_with_metadata.dropna(subset=['sex'], inplace=True)\n",
    "# rename SV/CNV signatures if present\n",
    "merged_sigs_with_metadata.columns = merged_sigs_with_metadata.columns.str.replace(\"SBSSV\", \"SV_\")\n",
    "merged_sigs_with_metadata.columns = merged_sigs_with_metadata.columns.str.replace(\"SBSCNV\", \"CNV_\")\n",
    "qgrid.show_grid(merged_sigs_with_metadata, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "for mutation_type in mutation_types:\n",
    "    columns_to_scan = merged_sigs_with_metadata.columns \n",
    "    if mutation_type == 'CNV' and analysis=='COSMIC':\n",
    "        mutation_type = 'CN'\n",
    "        mutation_types = [item.replace('CNV', 'CN') for item in mutation_types]\n",
    "        merged_sigs_with_metadata.rename({'CNV_burden':'CN_burden'},axis=1,inplace=True)\n",
    "    if 'SBS' in mutation_type:\n",
    "        # SP output idiosyncrasy\n",
    "        columns_to_scan = [ col for col in columns_to_scan if \"CNV\" not in col ]\n",
    "        columns_to_scan = [ col for col in columns_to_scan if \"SV\" not in col ]\n",
    "    sigs = [col for col in columns_to_scan if mutation_type in col\n",
    "            and not '_abs' in col and not 'CI' in col and not 'TMB' in col and not 'burden' in col]\n",
    "    for signature in sigs:\n",
    "        for sample in merged_sigs_with_metadata.index:\n",
    "            attribution = merged_sigs_with_metadata.loc[sample, signature + '_abs']\n",
    "            CI_string = merged_sigs_with_metadata.loc[sample, signature]\n",
    "            if not pd.isnull(CI_string):\n",
    "                CI_string = CI_string.translate({ord(i):None for i in '[,]'})\n",
    "                central, lower_CI, upper_CI = [float(x) for x in CI_string.split()]\n",
    "                if lower_CI != 0:\n",
    "                    merged_sigs_with_metadata.loc[sample, signature + '_CI'] = attribution\n",
    "                    if relative_attributions:\n",
    "                        merged_sigs_with_metadata.loc[sample, signature + '_CI'] = attribution/merged_sigs_with_metadata.loc[sample, mutation_type + '_burden']\n",
    "                else:\n",
    "                    merged_sigs_with_metadata.loc[sample, signature + '_CI'] = 0\n",
    "            else:\n",
    "                central, lower_CI, upper_CI = [np.nan, np.nan, np.nan]\n",
    "                merged_sigs_with_metadata.loc[sample, signature + '_CI'] = np.nan\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "961\n"
     ]
    }
   ],
   "source": [
    "# drop Serbian hypermutator\n",
    "if 'PD47592a' in merged_sigs_with_metadata.index:\n",
    "    merged_sigs_with_metadata.drop('PD47592a', axis=0, inplace=True)\n",
    "print(len(merged_sigs_with_metadata))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "961\n"
     ]
    }
   ],
   "source": [
    "# exclude outliers\n",
    "if 'SBS' in mutation_types:\n",
    "    normalisation_factor = 1 if relative_attributions else merged_sigs_with_metadata['SBS_burden']\n",
    "    if exclude_AA_outliers:\n",
    "        SBS22a_label = 'SBS22_CI' if analysis=='COSMIC' else 'SBS1536D_CI'\n",
    "        SBS22b_label = 'SBS1536I_CI'\n",
    "        SBS22a_rel = merged_sigs_with_metadata[SBS22a_label]/normalisation_factor\n",
    "        SBS22b_rel = merged_sigs_with_metadata[SBS22b_label]/normalisation_factor\n",
    "        merged_sigs_with_metadata = merged_sigs_with_metadata[SBS22a_rel<0.1]\n",
    "        merged_sigs_with_metadata = merged_sigs_with_metadata[SBS22b_rel<0.1]\n",
    "    if exclude_Japanese_outliers:\n",
    "        SBS12_label = 'SBS12_CI' if analysis=='COSMIC' else 'SBS1536M_CI'\n",
    "        SBS12_rel = merged_sigs_with_metadata[SBS12_label]/normalisation_factor\n",
    "        merged_sigs_with_metadata = merged_sigs_with_metadata[SBS12_rel<0.1]\n",
    "print(len(merged_sigs_with_metadata))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['SBS1', 'SBS2', 'SBS4', 'SBS5', 'SBS12', 'SBS13', 'SBS18', 'SBS21', 'SBS22', 'SBS44', 'SBS1536A', 'SBS1536B', 'SBS1536F', 'SBS1536I', 'DBS2', 'DBS4', 'DBS9', 'DBS78C', 'DBS78D', 'ID1', 'ID2', 'ID3', 'ID5', 'ID8', 'ID9', 'ID11', 'ID12', 'ID83C']\n"
     ]
    }
   ],
   "source": [
    "# signatures and parameters to consider\n",
    "signatures = []\n",
    "for mutation_type in mutation_types:\n",
    "    columns_to_scan = merged_sigs_with_metadata.columns \n",
    "    if mutation_type == 'CNV' and analysis=='COSMIC':\n",
    "        mutation_type = 'CN'\n",
    "        mutation_types = [item.replace('CNV', 'CN') for item in mutation_types]\n",
    "    if 'SBS' in mutation_type:\n",
    "        # SP output idiosyncrasy\n",
    "        columns_to_scan = [ col for col in columns_to_scan if \"CNV\" not in col ]\n",
    "        columns_to_scan = [ col for col in columns_to_scan if \"SV\" not in col ]\n",
    "    signatures += [col for col in columns_to_scan if\n",
    "                   mutation_type in col and not '_abs' in col and not '_CI' in col and not 'burden' in col]\n",
    "print(signatures)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['SBS1', 'SBS2', 'SBS4', 'SBS5', 'SBS12', 'SBS13', 'SBS18', 'SBS22', 'SBS44', 'SBS1536A', 'SBS1536B', 'SBS1536F', 'SBS1536I', 'DBS2', 'DBS4', 'DBS9', 'DBS78C', 'DBS78D', 'ID1', 'ID2', 'ID3', 'ID5', 'ID8', 'ID11', 'ID12', 'ID83C']\n"
     ]
    }
   ],
   "source": [
    "# limit signatures to those present in at least 5 cases\n",
    "signature_counts = {}\n",
    "for signature in signatures:\n",
    "    signature_counts[signature] = 0\n",
    "    for sample in merged_sigs_with_metadata.index:\n",
    "        CI_string = merged_sigs_with_metadata.loc[sample, signature]\n",
    "        if not pd.isnull(CI_string):\n",
    "            CI_string = CI_string.translate({ord(i):None for i in '[,]'})\n",
    "            central, lower_CI, upper_CI = [float(x) for x in CI_string.split()]\n",
    "            if lower_CI!=0:\n",
    "                signature_counts[signature] += 1\n",
    "        else:\n",
    "            central, lower_CI, upper_CI = [np.nan, np.nan, np.nan]\n",
    "signatures_with_at_least_5_cases = [sig for sig in signature_counts.keys() if signature_counts[sig] > 5]\n",
    "signatures = signatures_with_at_least_5_cases\n",
    "print(signatures)\n",
    "# print([sig for sig in signature_counts.keys() if signature_counts[sig] <= 5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ASR from Globocan\n",
    "countries = ['Lithuania','Czechia','Canada','Russia','UK','Poland','Romania','Japan','Serbia','Brazil','Thailand']\n",
    "countries_with_ASR = pd.DataFrame(index=countries, columns = ['ASR','ASR_m', 'ASR_f'], dtype='float')\n",
    "countries_with_ASR.loc['Lithuania'] = [14.5, 20.7, 10.1]\n",
    "countries_with_ASR.loc['Czechia'] = [14.4, 20.5, 9.0]\n",
    "countries_with_ASR.loc['Canada'] = [10.4, 13.9, 7.1]\n",
    "countries_with_ASR.loc['Russia'] = [10.3, 14.1, 7.7]\n",
    "countries_with_ASR.loc['UK'] = [10.3, 13.2, 7.6]\n",
    "countries_with_ASR.loc['Poland'] = [8.1, 11.4, 5.4]\n",
    "countries_with_ASR.loc['Romania'] = [7.7, 10.7, 5.0]\n",
    "countries_with_ASR.loc['Japan'] = [7.6, 11.2, 4.2]\n",
    "countries_with_ASR.loc['Serbia'] = [7.4, 10.6, 4.6]\n",
    "countries_with_ASR.loc['Brazil'] = [4.5, 5.8, 3.3]\n",
    "countries_with_ASR.loc['Thailand'] = [1.8, 2.7, 1.2]\n",
    "countries_with_ASR['country'] = countries_with_ASR.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2d66a07eb7b74499b8ab9cd9aa959870",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "if only_czech_republic:\n",
    "    # counties with at least 5 cases\n",
    "    merged_sigs_with_metadata = merged_sigs_with_metadata[merged_sigs_with_metadata.groupby('county')['county'].transform('size') >= 5]\n",
    "    merged_sigs_with_metadata.rename(columns={\"Incidence\": \"ASR\"}, inplace=True)\n",
    "    stripped_dataframe = merged_sigs_with_metadata.drop_duplicates(subset='ASR', keep=\"last\")[['ASR','county']]\n",
    "    counties = merged_sigs_with_metadata['county'].unique()\n",
    "    stripped_dataframe.set_index('county', inplace=True)\n",
    "    stripped_dataframe['county'] = stripped_dataframe.index\n",
    "    counties_with_ASR = stripped_dataframe.copy()\n",
    "else:\n",
    "    for case in merged_sigs_with_metadata.index:\n",
    "        country = merged_sigs_with_metadata.loc[case, 'country']\n",
    "        if sex_dependent_ASR:\n",
    "            sex = merged_sigs_with_metadata.loc[case, 'sex']\n",
    "            if sex=='Male':\n",
    "                merged_sigs_with_metadata.loc[case, 'ASR'] = countries_with_ASR.loc[country, 'ASR_m']\n",
    "            else:\n",
    "                merged_sigs_with_metadata.loc[case, 'ASR'] = countries_with_ASR.loc[country, 'ASR_f']\n",
    "        else:\n",
    "            merged_sigs_with_metadata.loc[case, 'ASR'] = countries_with_ASR.loc[country, 'ASR']\n",
    "qgrid.show_grid(merged_sigs_with_metadata, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_sigs_with_metadata_local = merged_sigs_with_metadata.copy()\n",
    "# regressions to adjust TMB in plots\n",
    "if adjust_TMB:\n",
    "    for mtype in mutation_types:\n",
    "        # age-adjusted regression\n",
    "        TMB_age_regression = smf.ols(formula=\"%s_burden ~ %s age_diag\" % (mtype, 'sex + ' if not (males_only or females_only) else ''), data=merged_sigs_with_metadata_local).fit()\n",
    "    #     print(TMB_age_regression.summary())\n",
    "        merged_sigs_with_metadata_local['%s_burden_age_adj' % mtype] = TMB_age_regression.resid + merged_sigs_with_metadata_local['%s_burden' % mtype].mean()\n",
    "        # age and stage-adjusted regression (missing values e.g. for Thailand)\n",
    "        merged_sigs_with_metadata_local['stage'].replace('Missing', np.nan, inplace=True)\n",
    "        TMB_age_stage_regression = smf.ols(formula=\"%s_burden ~ %s stage + age_diag\" % (mtype, 'sex + ' if not (males_only or females_only) else ''), data=merged_sigs_with_metadata_local).fit()\n",
    "    #     print(TMB_age_stage_regression.summary())\n",
    "        merged_sigs_with_metadata_local['%s_burden_age_stage_adj' % mtype] = TMB_age_stage_regression.resid + merged_sigs_with_metadata_local['%s_burden' % mtype].mean()\n",
    "        # all burdens are age and stage-adjusted\n",
    "        merged_sigs_with_metadata_local['%s_burden' % mtype] = merged_sigs_with_metadata_local['%s_burden_age_stage_adj' % mtype]\n",
    "        # replace missing TMB for Thailand cases which don't have stage information with age-adjusted values\n",
    "        merged_sigs_with_metadata_local.loc[merged_sigs_with_metadata_local['country'] == 'Thailand', '%s_burden' % mtype] = merged_sigs_with_metadata_local.loc[merged_sigs_with_metadata_local['country'] == 'Thailand', '%s_burden_age_adj' % mtype]\n",
    "for signature in signatures_with_at_least_5_cases + [mtype + '_burden' for mtype in mutation_types]:\n",
    "    for country in countries:\n",
    "        restricted_dataframe = merged_sigs_with_metadata_local[merged_sigs_with_metadata_local['country']==country].copy()\n",
    "        if restricted_dataframe.empty:\n",
    "            mean_value = median_value = sig_frequency = sig_minus_error = sig_plus_error = np.nan\n",
    "        else:\n",
    "            signature_name = signature + '_CI' if not 'burden' in signature else signature\n",
    "            mean_value = restricted_dataframe[signature_name].mean()\n",
    "            median_value = restricted_dataframe[signature_name].median()\n",
    "            cases_with_sig = len(restricted_dataframe[restricted_dataframe[signature_name]>0])\n",
    "            sig_frequency = cases_with_sig/len(restricted_dataframe) if not 'burden' in signature else np.nan\n",
    "            # standard deviation:\n",
    "            sig_minus_error = sig_plus_error = np.std(restricted_dataframe[signature_name])\n",
    "            # standard error of the mean:\n",
    "            sig_minus_error = sig_plus_error = np.std(restricted_dataframe[signature_name], ddof=1) / np.sqrt(np.size(restricted_dataframe[signature_name]))\n",
    "        if show_median:\n",
    "            countries_with_ASR.loc[country, signature] = median_value\n",
    "            # IQR:\n",
    "            sig_minus_error = median_value-restricted_dataframe[signature_name].quantile(0.25)\n",
    "            sig_plus_error = restricted_dataframe[signature_name].quantile(0.75)-median_value\n",
    "        else:\n",
    "            countries_with_ASR.loc[country, signature] = mean_value\n",
    "        countries_with_ASR.loc[country, [signature + '_freq', signature + '_minus_error', signature + '_plus_error', 'N']] = [sig_frequency, sig_minus_error, sig_plus_error, len(restricted_dataframe)]\n",
    "    if only_czech_republic:\n",
    "        for county in counties:\n",
    "            restricted_dataframe = merged_sigs_with_metadata_local[merged_sigs_with_metadata_local['county']==county]\n",
    "            if 'burden' in signature and adjust_TMB:\n",
    "                TMB_age_regression = smf.ols(formula=\"%s ~ %s age_diag\" % (signature, 'sex + ' if not (males_only or females_only) else ''), data=restricted_dataframe).fit()\n",
    "                restricted_dataframe.loc[:, signature] = TMB_age_regression.get_prediction().predicted_mean\n",
    "            if restricted_dataframe.empty:\n",
    "                mean_value = median_value = sig_frequency = sig_minus_error = sig_plus_error = np.nan\n",
    "            else:\n",
    "                signature_name = signature + '_CI' if not 'burden' in signature else signature\n",
    "                signature_name = signature + '_CI' if not 'burden' in signature else signature\n",
    "                mean_value = restricted_dataframe[signature_name].mean()\n",
    "                median_value = restricted_dataframe[signature_name].median()\n",
    "                cases_with_sig = len(restricted_dataframe[restricted_dataframe[signature_name]>0])\n",
    "                sig_frequency = cases_with_sig/len(restricted_dataframe) if not 'burden' in signature else np.nan\n",
    "                # standard deviation:\n",
    "                sig_minus_error = sig_plus_error = np.std(restricted_dataframe[signature_name])\n",
    "                # standard error of the mean:\n",
    "                sig_minus_error = sig_plus_error = np.std(restricted_dataframe[signature_name], ddof=1) / np.sqrt(np.size(restricted_dataframe[signature_name]))\n",
    "            if show_median:\n",
    "                counties_with_ASR.loc[county, signature] = median_value\n",
    "                # IQR:\n",
    "                sig_minus_error = median_value-restricted_dataframe[signature_name].quantile(0.25)\n",
    "                sig_plus_error = restricted_dataframe[signature_name].quantile(0.75)-median_value\n",
    "            else:\n",
    "                counties_with_ASR.loc[county, signature] = mean_value\n",
    "            counties_with_ASR.loc[county, [signature + '_freq', signature + '_minus_error', signature + '_plus_error', 'N']] = [sig_frequency, sig_minus_error, sig_plus_error, len(restricted_dataframe)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "if males_only:\n",
    "    countries_with_ASR['ASR'] = countries_with_ASR['ASR_m']\n",
    "elif females_only:\n",
    "    countries_with_ASR['ASR'] = countries_with_ASR['ASR_f']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4f4b0693c6304c6eb975aab7ab81d0e6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "qgrid.show_grid(countries_with_ASR, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.69\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.037284\n",
      "         Iterations 10\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.32\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.658612\n",
      "         Iterations 5\n",
      "Freq: 0.5733610822060354\n",
      "Logit: True\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:              SBS4_bool   No. Observations:                  961\n",
      "Model:                          Logit   Df Residuals:                      957\n",
      "Method:                           MLE   Df Model:                            3\n",
      "Date:                Tue, 06 Feb 2024   Pseudo R-squ.:                 0.03478\n",
      "Time:                        16:14:33   Log-Likelihood:                -632.93\n",
      "converged:                       True   LL-Null:                       -655.73\n",
      "Covariance Type:            nonrobust   LLR p-value:                 6.853e-10\n",
      "===============================================================================\n",
      "                  coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept      -2.3671      0.425     -5.565      0.000      -3.201      -1.533\n",
      "sex[T.Male]     0.0070      0.137      0.051      0.959      -0.262       0.276\n",
      "ASR             0.0563      0.021      2.641      0.008       0.015       0.098\n",
      "age_diag        0.0348      0.006      5.916      0.000       0.023       0.046\n",
      "===============================================================================\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.0083\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.259240\n",
      "         Iterations 7\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.75\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.174173\n",
      "         Iterations 8\n",
      "Freq: 0.046826222684703434\n",
      "Logit: True\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:             SBS12_bool   No. Observations:                  961\n",
      "Model:                          Logit   Df Residuals:                      957\n",
      "Method:                           MLE   Df Model:                            3\n",
      "Date:                Tue, 06 Feb 2024   Pseudo R-squ.:                 0.07875\n",
      "Time:                        16:14:33   Log-Likelihood:                -167.38\n",
      "converged:                       True   LL-Null:                       -181.69\n",
      "Covariance Type:            nonrobust   LLR p-value:                 2.695e-06\n",
      "===============================================================================\n",
      "                  coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept      -4.3164      1.046     -4.126      0.000      -6.367      -2.266\n",
      "sex[T.Male]     0.6513      0.343      1.899      0.058      -0.021       1.323\n",
      "ASR            -0.1965      0.049     -3.999      0.000      -0.293      -0.100\n",
      "age_diag        0.0439      0.014      3.102      0.002       0.016       0.072\n",
      "===============================================================================\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=6.4e-05\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.434017\n",
      "         Iterations 6\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.054\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.259210\n",
      "         Iterations 7\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.47\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.219720\n",
      "         Iterations 8\n",
      "Freq: 0.07388137356919876\n",
      "Logit: True\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:             SBS22_bool   No. Observations:                  961\n",
      "Model:                          Logit   Df Residuals:                      957\n",
      "Method:                           MLE   Df Model:                            3\n",
      "Date:                Tue, 06 Feb 2024   Pseudo R-squ.:                  0.1664\n",
      "Time:                        16:14:34   Log-Likelihood:                -211.15\n",
      "converged:                       True   LL-Null:                       -253.29\n",
      "Covariance Type:            nonrobust   LLR p-value:                 3.722e-18\n",
      "===============================================================================\n",
      "                  coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept      -2.5003      0.846     -2.955      0.003      -4.158      -0.842\n",
      "sex[T.Male]    -0.2148      0.264     -0.814      0.416      -0.732       0.302\n",
      "ASR            -0.3323      0.044     -7.481      0.000      -0.419      -0.245\n",
      "age_diag        0.0495      0.012      4.117      0.000       0.026       0.073\n",
      "===============================================================================\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=7.4e-14\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.036591\n",
      "         Iterations 10\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.15\n",
      "Freq: 0.8636836628511967\n",
      "Logit: False\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:            SBS1536A_CI   R-squared:                       0.241\n",
      "Model:                            OLS   Adj. R-squared:                  0.239\n",
      "Method:                 Least Squares   F-statistic:                     101.5\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           4.56e-57\n",
      "Time:                        16:14:35   Log-Likelihood:                -7639.6\n",
      "No. Observations:                 961   AIC:                         1.529e+04\n",
      "Df Residuals:                     957   BIC:                         1.531e+04\n",
      "Df Model:                           3                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept   -1196.2149    137.314     -8.712      0.000   -1465.686    -926.744\n",
      "sex[T.Male]   233.1231     45.549      5.118      0.000     143.736     322.510\n",
      "ASR            62.2369      7.053      8.825      0.000      48.396      76.077\n",
      "age_diag       26.2063      1.897     13.815      0.000      22.484      29.929\n",
      "==============================================================================\n",
      "Omnibus:                       81.338   Durbin-Watson:                   1.829\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):              137.915\n",
      "Skew:                           0.590   Prob(JB):                     1.13e-30\n",
      "Kurtosis:                       4.433   Cond. No.                         386.\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=5.1e-18\n",
      "Freq: 0.8959417273673257\n",
      "Logit: False\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:            SBS1536B_CI   R-squared:                       0.080\n",
      "Model:                            OLS   Adj. R-squared:                  0.077\n",
      "Method:                 Least Squares   F-statistic:                     27.57\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           4.21e-17\n",
      "Time:                        16:14:35   Log-Likelihood:                -7908.7\n",
      "No. Observations:                 961   AIC:                         1.583e+04\n",
      "Df Residuals:                     957   BIC:                         1.584e+04\n",
      "Df Model:                           3                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept      61.3250    181.686      0.338      0.736    -295.223     417.873\n",
      "sex[T.Male]    35.7460     60.267      0.593      0.553     -82.526     154.018\n",
      "ASR            28.6025      9.332      3.065      0.002      10.290      46.915\n",
      "age_diag       20.9849      2.510      8.361      0.000      16.059      25.911\n",
      "==============================================================================\n",
      "Omnibus:                       99.784   Durbin-Watson:                   2.104\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):              318.772\n",
      "Skew:                           0.492   Prob(JB):                     6.02e-70\n",
      "Kurtosis:                       5.645   Cond. No.                         386.\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "Number of tests =  29\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "p<sub>ASR</sub>=0.0022\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.368168\n",
      "         Iterations 6\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.77\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.273874\n",
      "         Iterations 7\n",
      "Freq: 0.09885535900104059\n",
      "Logit: True\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:          SBS1536I_bool   No. Observations:                  961\n",
      "Model:                          Logit   Df Residuals:                      957\n",
      "Method:                           MLE   Df Model:                            3\n",
      "Date:                Tue, 06 Feb 2024   Pseudo R-squ.:                  0.1509\n",
      "Time:                        16:14:36   Log-Likelihood:                -263.19\n",
      "converged:                       True   LL-Null:                       -309.98\n",
      "Covariance Type:            nonrobust   LLR p-value:                 3.736e-20\n",
      "===============================================================================\n",
      "                  coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept      -2.2520      0.736     -3.059      0.002      -3.695      -0.809\n",
      "sex[T.Male]    -0.0617      0.233     -0.265      0.791      -0.519       0.395\n",
      "ASR            -0.3089      0.039     -7.980      0.000      -0.385      -0.233\n",
      "age_diag        0.0470      0.010      4.487      0.000       0.026       0.068\n",
      "===============================================================================\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=1.5e-15\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.671597\n",
      "         Iterations 5\n",
      "Number of tests =  29\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-16-dab4e559e1ca>:124: SettingWithCopyWarning:\n",
      "\n",
      "\n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n",
      "<ipython-input-16-dab4e559e1ca>:125: SettingWithCopyWarning:\n",
      "\n",
      "\n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "p<sub>ASR</sub>=0.18\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.319509\n",
      "         Iterations 7\n",
      "Number of tests =  29\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-16-dab4e559e1ca>:124: SettingWithCopyWarning:\n",
      "\n",
      "\n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n",
      "<ipython-input-16-dab4e559e1ca>:125: SettingWithCopyWarning:\n",
      "\n",
      "\n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "p<sub>ASR</sub>=0.0044\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.167347\n",
      "         Iterations 7\n",
      "Number of tests =  29\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-16-dab4e559e1ca>:124: SettingWithCopyWarning:\n",
      "\n",
      "\n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n",
      "<ipython-input-16-dab4e559e1ca>:125: SettingWithCopyWarning:\n",
      "\n",
      "\n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "p<sub>ASR</sub>=0.33\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.335175\n",
      "         Iterations 6\n",
      "Number of tests =  29\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-16-dab4e559e1ca>:124: SettingWithCopyWarning:\n",
      "\n",
      "\n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n",
      "<ipython-input-16-dab4e559e1ca>:125: SettingWithCopyWarning:\n",
      "\n",
      "\n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "p<sub>ASR</sub>=0.51\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.226648\n",
      "         Iterations 7\n",
      "Number of tests =  29\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-16-dab4e559e1ca>:124: SettingWithCopyWarning:\n",
      "\n",
      "\n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n",
      "<ipython-input-16-dab4e559e1ca>:125: SettingWithCopyWarning:\n",
      "\n",
      "\n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "p<sub>ASR</sub>=7.9e-05\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.51\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.084060\n",
      "         Iterations 8\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.44\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.174461\n",
      "         Iterations 7\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.12\n",
      "Freq: 0.9302809573361083\n",
      "Logit: False\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                 ID5_CI   R-squared:                       0.225\n",
      "Model:                            OLS   Adj. R-squared:                  0.223\n",
      "Method:                 Least Squares   F-statistic:                     92.74\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           1.09e-52\n",
      "Time:                        16:14:38   Log-Likelihood:                -6541.6\n",
      "No. Observations:                 961   AIC:                         1.309e+04\n",
      "Df Residuals:                     957   BIC:                         1.311e+04\n",
      "Df Model:                           3                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept    -301.7059     43.802     -6.888      0.000    -387.664    -215.748\n",
      "sex[T.Male]    56.4194     14.530      3.883      0.000      27.906      84.933\n",
      "ASR            14.6112      2.250      6.495      0.000      10.196      19.026\n",
      "age_diag        8.8552      0.605     14.634      0.000       7.668      10.043\n",
      "==============================================================================\n",
      "Omnibus:                       39.671   Durbin-Watson:                   1.799\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               47.465\n",
      "Skew:                           0.442   Prob(JB):                     4.93e-11\n",
      "Kurtosis:                       3.637   Cond. No.                         386.\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=1.3e-10\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.535709\n",
      "         Iterations 6\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=7.1e-05\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.045507\n",
      "         Iterations 10\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.2\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.035098\n",
      "         Iterations 10\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.1\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.079942\n",
      "         Iterations 9\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.0011\n",
      "Freq: 1\n",
      "Logit: False\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:             SBS_burden   R-squared:                       0.151\n",
      "Model:                            OLS   Adj. R-squared:                  0.149\n",
      "Method:                 Least Squares   F-statistic:                     56.92\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           7.35e-34\n",
      "Time:                        16:14:40   Log-Likelihood:                -9211.2\n",
      "No. Observations:                 961   AIC:                         1.843e+04\n",
      "Df Residuals:                     957   BIC:                         1.845e+04\n",
      "Df Model:                           3                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept   -1012.4105    704.584     -1.437      0.151   -2395.118     370.297\n",
      "sex[T.Male]   648.5058    233.719      2.775      0.006     189.845    1107.167\n",
      "ASR          -102.0744     36.188     -2.821      0.005    -173.092     -31.057\n",
      "age_diag      124.5838      9.734     12.799      0.000     105.482     143.685\n",
      "==============================================================================\n",
      "Omnibus:                     1002.908   Durbin-Watson:                   1.570\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):            60461.282\n",
      "Skew:                           4.943   Prob(JB):                         0.00\n",
      "Kurtosis:                      40.579   Cond. No.                         386.\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.0049\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.033\n",
      "Freq: 1\n",
      "Logit: False\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:              ID_burden   R-squared:                       0.050\n",
      "Model:                            OLS   Adj. R-squared:                  0.047\n",
      "Method:                 Least Squares   F-statistic:                     16.78\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           1.25e-10\n",
      "Time:                        16:14:40   Log-Likelihood:                -7725.2\n",
      "No. Observations:                 961   AIC:                         1.546e+04\n",
      "Df Residuals:                     957   BIC:                         1.548e+04\n",
      "Df Model:                           3                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept    -198.9059    150.098     -1.325      0.185    -493.465      95.654\n",
      "sex[T.Male]   125.4067     49.789      2.519      0.012      27.698     223.116\n",
      "ASR             9.3199      7.709      1.209      0.227      -5.809      24.449\n",
      "age_diag       13.6685      2.074      6.592      0.000       9.599      17.738\n",
      "==============================================================================\n",
      "Omnibus:                     1836.191   Durbin-Watson:                   1.944\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):          2293390.931\n",
      "Skew:                          13.804   Prob(JB):                         0.00\n",
      "Kurtosis:                     240.724   Cond. No.                         386.\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "Number of tests =  29\n",
      "p<sub>ASR</sub>=0.23\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAEJCAYAAABohnsfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAywElEQVR4nO3de3QU5f0/8PfsNbvZcE3WxEDxRoxNNBGwNBWD2JYEJVUXv/4EK2qlFtoi2paKQEnhgKBF8Xj4xnqhlINKSVEJIA2iQb7hokhaoJEoF4FySUMCiWSz9935/bHZyW6ySWbJ3hLer3M4m52dzH42JPOeeZ5nnhFEURRBREQkgyLWBRARUe/B0CAiItkYGkREJBtDg4iIZGNoEBGRbKpYFxApHo8HLS0tUKvVEAQh1uUQEfUKoijC6XQiMTERCkXH84o+GxotLS04cuRIrMsgIuqVMjIykJSU1GF5nw0NtVoNwPvBNRpNWLZZXV2N7OzssGwrFlh/bLH+2GL98jgcDhw5ckTah7bXZ0PD1ySl0Wig1WrDtt1wbisWWH9ssf7YYv3yddasz45wIiKSjaFBRESyMTSIiEg2hgYREcnG0CAiItkYGkREJBtDgyhMDhw5jz++uRcHj9THuhSiiOmz12kQRdu6j77G4RMXYbW7kJOREutyiCKCZxpEYWK1uwIeifoihgYREcnG0CAiItkYGkREJBtDg4iIZGNoEBGRbAwNIiKSjaFBRESyMTSIiEg2hgYREckW0dAwm82YOHEizpw5AwD417/+hQcffBD33HMPfvOb38DhcAAAampqYDKZUFBQgHnz5sHl8l5Re+7cOTz88MMoLCzEjBkz0NLSEslyiYioGxELjYMHD2Ly5Mk4efIkAG+AzJw5E4sWLcKHH34IANiwYQMAYPbs2ViwYAG2bdsGURRRWloKAFi4cCGmTJmC8vJyZGdno6SkJFLlEhGRDBELjdLSUhQXF8NoNAIAdu/ejdzcXGRmZgIA5s+fjx//+Mc4e/YsbDYbcnNzAQAmkwnl5eVwOp344osvUFBQELCciIhiJ2Kz3C5ZsiTg+alTp6DX6/HMM8/gm2++wYgRIzBnzhwcPnwYKSltM4KmpKSgrq4OjY2NMBgMUKlUAcuJiCh2ojY1utvtxq5du7B+/XpcffXVmDdvHt544w384Ac/gCAI0nqiKEIQBOnRX/vnclRXV/e4dn9VVVVh3V60sf7IsVis0mNndcZz/XKw/tiKh/qjFhrJycnIycnB0KFDAQATJkzA22+/DZPJhPr6tpvWNDQ0wGg0YtCgQWhubobb7YZSqUR9fb3U1BWK7OxsaLXasHyGqqoqjBw5MizbigXWH1n6T3cATU7o9bqgdcZ7/d1h/bEVrfrtdnuXB9tRG3I7ZswYfPnll6itrQUA7NixA1lZWUhPT4dWq5UStKysDPn5+VCr1Rg1ahS2bt0KANi4cSPy8/OjVS4REQURtTONtLQ0LFq0CNOnT4fdbsdNN92EZ599FgCwfPlyzJ8/H2azGVlZWZg6dSoAoLi4GHPmzMFrr72GtLQ0vPzyy9Eql4iIgoh4aFRUVEhf33nnnbjzzjs7rJOZmSkNv/WXnp6OtWvXRrI8IiIKAa8IJyIi2RgaREQkG0ODiIhkY2gQEZFsDA0iIpKNoUFERLIxNIiISDaGBhERycbQICIi2RgaREQkG0ODiIhkY2gQEZFsDA0iIpKNoUFERLIxNIiISDaGBhERycbQICIi2RgaREQkG0ODiIhkY2gQEZFsDA0iIpKNoUFERLJFNDTMZjMmTpyIM2fOBCx/++238cgjj0jPa2pqYDKZUFBQgHnz5sHlcgEAzp07h4cffhiFhYWYMWMGWlpaIlkuERF1I2KhcfDgQUyePBknT54MWH7s2DG88cYbActmz56NBQsWYNu2bRBFEaWlpQCAhQsXYsqUKSgvL0d2djZKSkoiVS4REckQsdAoLS1FcXExjEajtMzhcGDBggV46qmnpGVnz56FzWZDbm4uAMBkMqG8vBxOpxNffPEFCgoKApYTEVHsqCK14SVLlnRY9tJLL2HSpEkYMmSItOz8+fNISUmRnqekpKCurg6NjY0wGAxQqVQBy0NVXV19GdV3rqqqKqzbizbWHzkWi1V67KzOeK5fDtYfW/FQf8RCo73du3ejtrYWzz33HD7//HNpucfjgSAI0nNRFCEIgvTor/1zObKzs6HVai+/cD9VVVUYOXJkWLYVC6w/svSf7gCanNDrdUHrjPf6u8P6Yyta9dvt9i4PtqMWGlu2bMHRo0dx7733wmKxoKGhAU8//TRmz56N+vp6ab2GhgYYjUYMGjQIzc3NcLvdUCqVqK+vD2jqIiKi6ItaaCxdulT6+vPPP8fKlSvxyiuvAAC0Wq2UomVlZcjPz4darcaoUaOwdetWFBUVYePGjcjPz49WuUREFERcXKexfPlyLF26FIWFhbBYLJg6dSoAoLi4GKWlpbj77ruxf/9+PP3007EtlIjoChfxM42KiooOy0aPHo3Ro0dLzzMzM7Fhw4YO66Wnp2Pt2rURrY+IiOSLizMNIiLqHRgaREQkG0ODiIhkY2gQEZFsDA0iIpKNoUFERLIxNIiISDaGBhERycbQICIi2RgaREQkG0ODiIhkY2gQEZFsDA0iIpKNoUFERLIxNIiISDaGBlEYWGxONLc4AADNLQ5YbM4YV0QUGQwNoh768psLeGzRR7hwyQYAuHDJhscWfYQvv7kQ48qIwo+hQdQDFpsTC9/6DFa7C6LoXSaKgNXukpYT9SUMDaIeqDxwDh5fWrTjEUVUHjgb5YqIIouhQdQDtQ1m2B3uoK/ZHW7UNpijXBFRZDE0iHogLdkArUYZ9DWtRom0ZEOUKyKKrIiGhtlsxsSJE3HmzBkAwPr16zFx4kQUFRXhueeeg8PhHW1SU1MDk8mEgoICzJs3Dy6Xtx343LlzePjhh1FYWIgZM2agpaUlkuUSheyO3KuhEISgrykEAXfkpke5IqLIilhoHDx4EJMnT8bJkycBACdOnMCqVavwt7/9DZs2bYLH48G7774LAJg9ezYWLFiAbdu2QRRFlJaWAgAWLlyIKVOmoLy8HNnZ2SgpKYlUuUSXRZ+gRvG070OnVcGXHYIA6LQqaTlRXxJSaDgcDnz77bdoamqS/nWmtLQUxcXFMBqNAACNRoPi4mIYDAYIgoCMjAycO3cOZ8+ehc1mQ25uLgDAZDKhvLwcTqcTX3zxBQoKCgKWE8WbrOsGY01xAQb3TwAADO6fgDXFBci6bnCMKyMKP9mHQevWrcPSpUvhdHovWhJFEYIgoKamJuj6S5YsCXienp6O9HTvqfrFixfxzjvvYOnSpTh//jxSUlKk9VJSUlBXV4fGxkYYDAaoVKqA5UTxSKdVIUmvQUOTDUl6Dc8wqM+S/Zu9atUqrFu3DllZWT16w7q6OkybNg2TJk3C6NGjUVVVBcGvTdgXRr5Hf+2fy1FdXd2jeturqqoK6/aijfVHjsVilR47qzOe65eD9cdWPNQvOzSSk5N7HBjHjx/HtGnT8Mgjj+BnP/sZACA1NRX19fXSOg0NDTAajRg0aBCam5vhdruhVCpRX18vNXWFIjs7G1qttkd1+1RVVWHkyJFh2VYssP7I0n+6A2hyQq/XBa0z3uvvDuuPrWjVb7fbuzzYlt2nMWbMGLz77ruoq6uT1afRntlsxhNPPIFZs2ZJgQF4m620Wq2UoGVlZcjPz4darcaoUaOwdetWAMDGjRuRn58v+/2IiCj8ZJ9pvPHGG3A4HFi0aJG0rKs+jfY2bNiAhoYGrF69GqtXrwYA3HXXXZg1axaWL1+O+fPnw2w2IysrC1OnTgUAFBcXY86cOXjttdeQlpaGl19+OZTPRkREYSY7NA4dOnRZb1BRUQEAeOyxx/DYY48FXSczMxMbNmzosDw9PR1r1669rPclIqLwk9085fF4sGrVKsyZMwdmsxmvv/463O7g0ycQEVHfJDs0XnzxRXz99dc4ePAgRFFEZWUlli5dGsnaiIgozsgOjb1792LZsmXQarVISkrCX/7yF+zevTuStRERUZyRHRoqlQoKRdvqGo1GuvCOiIiuDLL3+hkZGXjnnXfgdrvxzTff4K9//SsyMzMjWRsREcUZ2Wca8+bNw5dffokLFy5g8uTJaGlpwdy5cyNZGxERxRnZZxoGgwHPP/98JGshIqI4121oPPfcc12+zhFURERXjm6bp4YPH47hw4ejubkZX3/9NW688UZ897vfxcmTJ3mdBhHRFabbMw3fPFHbt2/HO++8A51OBwB48MEHpek+iIjoyiC7I/zChQvQaDTSc0EQ0NjYGJGiiIgoPsnuCM/Ly8O0adMwceJEiKKIsrIy3HXXXZGsjYiI4ozs0PjDH/6Ad955B9u3bwcATJgwAQ899FDECiMiovgjOzSeeOIJrFmzBo8++mgk6yEiojgmu0+jubkZFoslkrUQEVGck32modPpMG7cONx4443Q6/XS8j//+c8RKYyIiOKP7NB44IEHIlkHERH1ArJD4/77749kHURE1AvIDo1bb70VgiB0WP7Pf/4zrAUREVH8kh0aW7Zskb52OBz48MMPpavDiYjoyiB79FR6err079prr8Wvf/1rlJeXR7I2IiKKM7JDo73jx4/jwoULXa5jNpsxceJEnDlzBgCwZ88eFBUVYfz48VixYoW0Xk1NDUwmEwoKCjBv3jy4XC4AwLlz5/Dwww+jsLAQM2bMQEtLy+WWS0REYSA7NG699VaMGDECI0aMwK233op77723ywv9Dh48iMmTJ+PkyZMAAJvNhrlz56KkpARbt25FdXU1du7cCQCYPXs2FixYgG3btkEURZSWlgIAFi5ciClTpqC8vBzZ2dkoKSnpwUclIqKekh0aW7ZswebNm7F582Z8+OGH+Oyzz7oMjdLSUhQXF8NoNAIADh06hGHDhmHo0KFQqVQoKipCeXk5zp49C5vNhtzcXACAyWRCeXk5nE4nvvjiCxQUFAQsJyKi2JHdEZ6eno7Kykrs2bMHKpUK+fn5uO222zpdf8mSJQHPz58/j5SUFOm50WhEXV1dh+UpKSmoq6tDY2MjDAYDVCpVwHIiIood2aHx5z//GZs2bUJBQQE8Hg/mz5+PqVOn4uGHH5b1/R6PJ2DIriiKEASh0+W+R3/Bhvx2p7q6OuTv6UpVVVVYtxdtrD9yLBar9NhZnfFcvxysP7biof6QhtyWlpbCYDAA8N6cacqUKbJDIzU1FfX19dLz+vp6GI3GDssbGhpgNBoxaNAgNDc3w+12Q6lUSuuHKjs7G1qtNuTvC6aqqgojR44My7ZigfVHlv7THUCTE3q9Lmid8V5/d1h/bEWrfrvd3uXBtuw+Da1Wi8TEROl5//79Q9oZ5+Tk4MSJEzh16hTcbje2bNmC/Px8pKenQ6vVSglaVlaG/Px8qNVqjBo1Clu3bgUAbNy4Efn5+bLfj4iIwq/bM42PPvoIAHDttdfil7/8Jf7nf/4HSqUSGzduRHZ2tuw30mq1WLZsGWbOnAm73Y6xY8eisLAQALB8+XLMnz8fZrMZWVlZ0m1ki4uLMWfOHLz22mtIS0vDyy+/fDmfkYiIwqTb0Fi7dm3A89WrV0tfd3edBgBUVFRIX+fl5WHTpk0d1snMzMSGDRs6LE9PT+/w/kREFDshh0Ywy5Ytw5w5c8JSEBERxa/LviLc3+effx6OzRARUZwLS2iIohiOzRARUZwLS2hczvUTRETU+4QlNIiI6MrA0CAKE51WFfBI1BexT4MoTKaMz8TITCOmjM+MdSlEESP7kKilpQUnTpyATqfD0KFDodFopNeee+65iBRH1JvkZKQgJyOl+xWJIuDAkfPYuPM47h97g6zfQ1EU4fZ4/3k8Hng8IjwewG53dPl93YaG2+3G0qVLsX79ehgMBgiCAKvVip/+9Kf4zW9+A0EQMHr0aPmfjIiIwm7dR1/j8ImLsNpdyMlIaQ0DEW63B25RhNvl8YaEW4Tb410GABAB/7Yi303wOtNtaLz55puora3Fxx9/jKuuugoAcPbsWSxevBivv/46pk+fftkfkoiI5FEqlR3PDkRAbH1utjgBAM0tDvy3wQxP6/eFu/eg29D4xz/+gfXr1yMhIUFalp6ejhdeeAGPPvooQ4PoChdqswh1zu32wNUaCL6zAk/rY1OLG7UXWm953e7sAAA8rekgAnBHsJtZVp+Gf2D49OvXj9dnEFGHZhEKzuPxnSV42s4YWkPB5fbA7fZ4zw6CBAIA2B3OTs8arHYnWqzeM40WqxNWuxM6rToin6Pb0FAoOh9gxVFTRGS1uwIeryTBgsDjFuERvWcNogh4xNZOZt83dRIKl+vo6SasLD0Au9MNAGhstmPOyt349YO5GD50QBjfyavb0LDZbDh8+HDQgLDZbGEviIgoljweEaIoSs1EvlFFHo8Ij9h6diC2vtb6PbE6frbanQGB4WN3urGy9ABemDkGCZrwXjfU7dbsdjtmzpwZNDTYPEVEvYEoilAolLA5XEHOCETpbEAUI3dGEAn7a87D00mVHojYX1OHMTnpYX3PbkPj1Vdfle7Z3R5Dg4hiyXf07/ENLxVFaTSRx2+Ukcsj4kKzExe/tcV9EISivtECp9MT9DWn04P6RkvAMo8owmpzodnigNni9D5anWhucUhfe9wuFI3Udfqe3YbGU089FTQcRFGEIAj45JNPutsEEVG33K3XFHg8IkSI0nBSbzC0XYwm+kJCFNsCQMZZgdvt7lOBAQDJA3RQqQS4XME/2YEjDThZ29wWEBanNMqqMwMSlUBPQsP/zntERHL5jvZ9O3nfLl4U4XcW4O0jcHnc8IjoFU1CkeTxiGixOQPPBCxOmC0OnDx9CXuP/Vta3mxxoMXq9P7cOlF30YK6i5ZOX0/QKpGk08CgVyNJ731M7qcGEPzsBQhhGhEi6ltE0XsEL+3YRbF1py227sBbd/Oid0fvhgqXzHZpZJDb4+0LcLq8Oxiny4PaBnPb9v2+uFKDwOMRvc0/FgfMFgeaWwOg2eKE2ep9bG7xNguZW5uHuj4RsIb0/voEFfJvTYdBp0GSXo2kRI3360Q1DDoN1KqOo2NdLicazn3T6TYZGkRR0tXwdX9ut8c7SsfTthP3EX074PZNM35P/I/YRVH0fk/r+tJRv3+HLyBrx954yYrm1msBOtPVUW9f4PZ4Wo/8/foD/M4KAr92wmJ19igw9VqV9ywgUQPRacPVqYNbd/xtZwZJeg00agVWvPtPNJkdGJikhS5BhXP1LRjULwH3jb0hbJ8fYGhQnPINvPDfSQoKJdzuwNNmUVq/7VmwI7VgYza8Y+jb9pZiJ3/eflP04N9H6/HhnpO45/ZrkH19codCAnbeHkidtKIINNtEnL9oad2BB99Li75NXcFH59HkdnuC7Pid7c4K2r5usfXsWpTEBBUMeu9Rv+/Ru/P3/7r1UaeGUtl2oFFdXY3s7Js63bZBr0GT2YFEXWQu6vOJSWiUlZXhjTfeAADk5+fj2WefxZ49e7B06VLY7XZMmDABzzzzDACgpqYG8+bNQ0tLC0aNGoWFCxdCpWLWtddhJ9v2QusRJiAoVHC63AHriEF2mgHb6OKINmBdvx2rfy2+9whsqhADtiP6jqb9txHkMzY2O1HX2Hn77OXuZUP5tvUfH8GxM9/CbHFg6FX9Qnofq80Bp7vztmLqOZfbI/UBXGo3QujUmUuo/PqQt1moxfuapQcXJAoA9Dp1h519+x2/73n7EOitor73tVqtWLJkCcrLy9GvXz9MnjwZFRUVWLRoEdauXYu0tDT84he/wM6dOzF27FjMnj0bixcvRm5uLubOnYvS0lJMmTJF9vu53G4o/HaUov9hIwJ3lP5HowGrtT7xCGqYrY6gR5W+5+0WBe6c/YjSDlgM8j3tj5hFv3ra1x/42JWLzXbUNwa2ifamo1mX2x2zi6h8bA53wCNFltPl6aQPIPCMwHem0P1V6Z33CQiAd+fu29kntp4RtO8PaF0nUaeCUmaTY18S9dBwu93weDywWq3Q6/VwuVwwGAwYNmwYhg4dCgAoKipCeXk5brjhBthsNuTm5gIATCYTXn311ZBC48IlO1TK4E0aobr4rQXfmrueaz6eud2dXQZEFB1Olzvozr59f4DZ4kSz1QGb/fLDWRAgHekn6TVwOy1IT0tBP18wtDtDSExQQ6HovdeeJWiU0mMkD2qiHhoGgwGzZs3ChAkToNPpcNttt+H8+fNISWmb6MxoNKKurq7D8pSUFNTV1YX0fh2aa4gobBxON1ytTW5Wuwuf/btWahZqf4Zgtjh6tDNTCEKHHb1/34D/WUBSogb6BBUUfs0H3j6BG3v8mePVxDHXYfu+/+DH3/sONlQc7XQ9ocMX3lu4CoIAQQAUYtfBGfXQ+Oqrr/Dee+9hx44dSEpKwu9+9zucPHky4AJC34WDHo8n6PJQHDt6BK4wzhNcXV0dtm3FAuvvGd98azab7bJqiXX93XG6RFgdHu8/u6fD15v3VcLm8MBi98DmEOH0+9u68K0Nf/3wsOz3UghAgkYBvVYBnUaBBI0COo0AXetznUYR8LVWLQT5+3e2/gPgAuyXvP8udPKe8f7zB3zN5IL0tQABELz30/jqqxrvcggQFAIUgndnrxCABEHAT24zQIGLcDrsAACnw4Fzp7/xbk0QpZxoexTbNcuL3U5EG/XQ2LVrF/Ly8jB48GAA3ianVatWQalUSuvU19fDaDQiNTUV9fX10vKGhgYYjcaQ3u+G4RlQqcIzmsB7pJIdlm3FAuvvuYR9nwOXzEhISAi5lmjXL4oi7E53l6OB2l834OhkSopQpAzUIbm/LshZQeAZgk6riupURLH4/fHb/0MBQKEQoBAU3h29wvfce4Tfmg3SEb8vLHyvV//737glJwdC63a68u7/NQJNTuj1CbhtxC0h1Wy327sM16iHRmZmJv70pz/BYrFAp9OhoqICOTk52Lx5M06dOoUhQ4Zgy5YtmDRpEtLT06HValFVVYWRI0eirKwM+fn50S6ZKC6Iogibwx38IrGWdv0CVu+j78K7y6FSKgKaf1wOC4akpaBfogZatQLv7TguNU35u2R2YN7j3wv77Kqx4L/TB9rv+AN3+oIgQFAg4Lmy9XWFIthZUmg8HjeUcdDnEvX/1TFjxuDw4cMwmUxQq9W4+eabMXPmTNx+++2YOXMm7HY7xo4di8LCQgDA8uXLMX/+fJjNZmRlZWHq1KnRLpkoIkRRhM3ubu0DaAsAc4cO4raACLaTlkutUniP9HWdH/37Xy+g1SgDdnTeI/UMAEDlgbMQFACCdFFEanbVnlIohIAQ8B7VAwpBAYUgQKls28H7zgK8O35FazhwklYgRtdpPPnkk3jyyScDluXl5WHTpk0d1s3MzMSGDRuiVRpRp7q7O5ooirDYXR12+r7HM+e+xSfV/2q9hsAbBO4eXEKtUSs6HQ7a8eIxNbRqZdh2eqHOrhoJnTX9eHf63p28QtHW9j+oXwIGD9C1BQND4LL0/vNHogjxn0a65mQj3qs4Kh3pNzbb8dtXKjHkKgPcblE6Q+g+BDq/cZlWrZSuE5BGAyX6XSSWqAm4YEyrUXa6rUhLGaiHWq0IGhxqtQIpA/Uhb7NjCHgDwL+JRzoDgCCFg1Ju04/HBa06dj+zvoKhQVcMjyjCYnVK00Y0twS/YKxtZtGup5H2iCL+89/mTl9P0CjbLhTTa+BymDH06qtg0KvRL0gHsaYX7dBG3WTEhk+CD+tUQMCom66SnvvCQAFAqVRAqVB4g6CnIUAxwdCgXquzaaSb240U8p9KItxXlCsUQG6GETk3JHdoFlKrAkPA2ycQ3snjYkWnVWPWQ7fizY3/xqUWbzObSilgQJIWA5MScP6CBbdkpEDp6ytoDQrq/RgaFDfcHg9arM6gw0GbLQ6c/W8Tth2sku401mLrWQjofDOIBrtIzG8KiV0HzuD/DpwLug2PB0gZkIDR2WmXX0gc6aqJaEA/HfonaqQRQ8aBOoy40YiZy3egvsmKAUkJSO6fgK9ONWLj/x1H3i1Xx/CTXLksNu/fBwA0tzhgsTmhTwjfJIYMDYqYoNNI+91Wsn0TkbxppO2dvqJPUPlNDqdBv8S2eYT6JbbOJKprm0JCJXPyuKGp/aBW/zes7fexIvgFglKpgErpPQPwng0EjhZqf2agggsGvSZgmVrl7Yepb7IiSa+G3ekdTtX9HFAUCV9+cwEL3/oMNof353/hkg2PLfoIxdO+j6zrBoflPRgaJFvX00h3PDOwhG0aae9Rv8NmxrAhVwXOItrJNNLhFEr7fSz5zhIEQOo3UClbA6B1OCmbivoui82JhW99FhDYougN8IVvfYY1xQXQaXu+y2doXMF800h3mCxOOiNou+NYOKaRTtSpu7xKWJpcLlGNRJ26wwyi3j6B63v4qUOn06rx6wdzsbL0gHQkDXhHO/36wdyoXcTm36GsUASeJfiHATuSr0yVB851OnDDI4qoPHAW40cP6/H7MDT6kA7TSLcEXjF8rq4JW/+1P4RppDvnm0G0fedv+/4A32sGXe+eQXT40AF4YeYYLHzzMzQ22zEwSYvin38/5MCoOXkRH7dOKpd5zaCA16SmIwFQKpRQtTYZKZWtw04VCqgUQp+4JwOFX22DGfZOJoS0O9wBt+LtCYZGEF39YUeTbxrpgOaglo63mAxtGungfQL+00gb9Gqcq2+B2erEoH5aFHz/moAzhH56DfS9fBrpy5GgUSFRp0Zjsx2JOnVIgSEAUCmV2LLrGxw78y2cTjdGZl7lDQOl0DrKqG0oam/k8YjSxY+R6IClrqUlG6DVKIMGh1ajRFqyISzvw9AI4sPWP2y7wxXW0HA43R129kGnkW5xoNnq7PSoQQ6FQvC7l4D3aN9hvYRhQ9M63lZS33Ea6cV/+Rxmq/ePfuyIIeH4+H1aZ/0J/k1HA5PU0sV/TrcHA5K0sSw57P5T13bNSiQ6YKlrd+RejVWbgk80qBAE3JEbnmldGBpByL07m93hDnoz+WD3ETBbnAHt4aFSKIQgQ0M76R9I1EAfZAZRb5/AtZddw5XO/8cpCAISE9QBI498fQyd9SeInr55tz+Pp+O92SPRAUtd0yeoUTzt+9LoKd/dSBM0KhRP+37Y/g/4P9lKFEVvCFjbdu4tVie2fXZS2vHX1jVi8/59UpNRT2YQVSoEGPSaDncR62wCuWhPI32lad/JrFQo2voS/JqP1Cpvf4LvQjYCHF38HYSzA5a6l3XdYKwpLsAvX/wEDU02DO6fgJLf/zCsod3nQ+Pit1aYbZZOrhoOPENoP4NoY7MdH3x6vN0Wg9/utf000u2HhXqvG9BIs4wmaMM3eRzJI/g1H/k3HXlHHPlGIrGTOVQ3pPdHbUNL0NfC2QFL8ui03uuVGppsSNJrwn6W1+dD4+V3/4mmlstrFlAqBKQbDdJwULv1Eq79TlrgzKKtF4y1n0aaYsP3X6BqDQKVSgGVsu1rDkcNv5wMI/bV1EW8A5biQ58PDR9pGul2w0G9U0oHnhmUbDiIcw0tSEtOxNzHvidtw9sncE3sPkSUdDcFeKwIQtv9EBSCAJVfELQ1IbVd0RwJvqM2ttG3iVYHLMWHPv+b/7ufjkQ/gz6kGUR765DHcDh6uingIrbGZjvmrNyNXz+Yi+FDB0T8/dtCAVAplG3NR0oBg/slIGWgLqZXNE8Zn4kPdh7D/WP7xsSD4RCtDliKD33+f3NAUgJUqt4z5XQsWe3ODlc9A4Dd6cbK0gN4YeaYHl/9HPSq5vZ9C8pOQkF0dZg5NtpyMlKQk5ES0xriUTQ6YCk+8H+UJPtrzsPTyZSBodzCs0O/QrvrFXhVc98U6Q5Yig/8XyWJ3Ft4BmtCUikFKJQKaWiqSskOZ6K+iKFBEt8tPEWPCLdHhCh6+3cGGLTQJyiROWwwBvXTdt2ERER9GtsIrlACvM1ISgFQKxVIUKswbsQQpCcbMLi/rm1KEdF7d7wmswO351wNndZ7W1IGBtGViaHRTrDhpr2RfyhoVAroNCoMMOjQ36DBoH5aDB6gw1UD9UhNNsA4SI/BAxJw1eBETDfdgkstDmmKZRHeYZPzHx/NNuogDhw5jz++uRcHj9THupS4wCHJfV9MQqOiogImkwkTJkzA4sWLAQB79uxBUVERxo8fjxUrVkjr1tTUwGQyoaCgAPPmzYPLFbk7gh093YQ5K3ejsdk7E6xvuOnR000Re8/L5QsFhXSmoERigncG2oFJraEwKBGpyQakDNRjUP8EqBQuGHQa6LRqaNXKoJ3RvlEwg/snAAAG90/AmuICTjrXiXUffY2qr87j3Y++inUpcWHK+EyMzDRiyvjMWJdCERL10Dh9+jSKi4tRUlKCTZs24fDhw9i5cyfmzp2LkpISbN26FdXV1di5cycAYPbs2ViwYAG2bdsGURRRWloakbq6G27qu31itPnOFrSqtlDwnSkYB+qROjix9UxBhwFJWu9khQmtoXCZTUi+UTAAOAqmG757kvD2pl45GSn448/zOCy5D4t6aGzfvh133303UlNToVarsWLFCuh0OgwbNgxDhw6FSqVCUVERysvLcfbsWdhsNuTm5gIATCYTysvLI1KXnOGmkeJ/xqDXqtBPr8aAJC0G90+QmpCSB7aFgu9MQaVUcIQSEUVV1A8hT506BbVajenTp6O2thZ33nknhg8fjpSUtiMTo9GIuro6nD9/PmB5SkoK6upC23kfO3oELnfwMPBXc6S5y+Gmh4/8Bz+4KQnV1cGnS+iMIHjvzSwIrXdhUyigVHmnwFAqBQgQIQBQCCJEse1fpFRVVclaz2KxSo9yvyca4qkWIPSfU7z+XOXqjTX7u1Lqj+TvWdRDw+12Y//+/Vi7di30ej1mzJiBhISEgCNmURQhCAI8Hk/Q5aG4YXgGVKru501qdJ3Fv08fCRocarUC3834DoBGZGdnd3jdd92CUhACptT23WchXmZQraqqwsiRI2Wtq/90B9DkhF6vk/09kRZK/dESys+pqqoKer0u7n6ucsXjzz8UV1L9Pfn7tdvtXR4cRz00kpOTkZeXh0GDvHfE+9GPfoTy8nIolW3TQ9TX18NoNCI1NRX19W2jUhoaGmA0GiNS16ibjNjwydGgr6kUCtz23atw4vgl6Y5sar+rnL0334ncJHlERKGI5Ci2qB/6jhs3Drt27cKlS5fgdrtRWVmJwsJCnDhxAqdOnYLb7caWLVuQn5+P9PR0aLVa6fSqrKwM+fn5EalLp1Xjqf+XiwEGrdSBrFYqcE1aP/zhZ6ORnpKEQQY1UgcnInmADv2TtDDovf0LapWyzwUGh052z2JzornFe38V3z2xu2J3ekJan+hyRXIUW9T3CDk5OZg2bRqmTJkCp9OJ22+/HZMnT8Z1112HmTNnwm63Y+zYsSgsLAQALF++HPPnz4fZbEZWVhamTp3a4xp8zUmq1nmRVK1nDN/7bipG3GjEUy99ivomK/onafDizDukHaenj96uMxjO5tq1L7+5IM3qCnR/T+wvv7mAlz6ohbO1f4330KZIiuTEmjE5jHzggQfwwAMPBCzLy8vDpk2bOqybmZmJDRs2XPZ7qRQCElpHGilVgtS81NXII4Nejfom6xU93JSzuXbOYnNi4VufBQyz7eqe2L71HS5R1vpE8azPXxGePECHwb7mJF1bcxKHqtLlqjxwTrpivj3fPbF7sj5RPOvzoUEUbrUN5qC3NgWC3xM71PWJ4hlDgyhEackGaDXBbwYV7J7Yoa5PFM8YGkQhuiP36rZZgNsJdk/sUNcnimcMDaIQ+e6JrdOqpLsUCoJ3eHKwe2L71teoBFnrE8UzhgbRZQh1NuCs6wbjt/encfZg6vUYGkSXKdTZgLVqBWcPpl6PoUFERLIxNIiISDaGBhERycbQICIi2RgaREQkG0MjCE4LTnLxd4WuNAyNICI5Fz31LfxdoSsND4+C4LTgJBd/V+hKwzMNIiKSjaFBRESyMTSIiEg2hgYREcnG0CAiItkYGkREJFtMQ+OFF17AnDlzAAB79uxBUVERxo8fjxUrVkjr1NTUwGQyoaCgAPPmzYPL5YpVuUREV7yYhcbevXvxwQcfAABsNhvmzp2LkpISbN26FdXV1di5cycAYPbs2ViwYAG2bdsGURRRWloaq5KJiK54MQmNpqYmrFixAtOnTwcAHDp0CMOGDcPQoUOhUqlQVFSE8vJynD17FjabDbm5uQAAk8mE8vLyWJRMRESIUWgsWLAAzzzzDPr16wcAOH/+PFJS2q6qNRqNqKur67A8JSUFdXV1Ua+XiIi8oj6NyN///nekpaUhLy8P77//PgDA4/FAEARpHVEUIQhCp8tDUV1dHZ7CW1VVVYV1e9HG+mPL7bRJj73xs/TGmv2x/p6Lemhs3boV9fX1uPfee/Htt9/CYrHg7NmzUCqV0jr19fUwGo1ITU1FfX29tLyhoQFGozGk98vOzoZWqw1L7VVVVRg5cmRYthULrD+2qqqq8KRpFD7YeQz3j72h181Z1Rd+/qy/e3a7vcuD7aiHxurVq6Wv33//fezbtw8LFy7E+PHjcerUKQwZMgRbtmzBpEmTkJ6eDq1WK/2wysrKkJ+fH+2SicKGExxSbxcXs9xqtVosW7YMM2fOhN1ux9ixY1FYWAgAWL58OebPnw+z2YysrCxMnTo1xtUSEV25YhoaJpMJJpMJAJCXl4dNmzZ1WCczMxMbNmyIdmlERBQErwgnIiLZGBpERCQbQ4OIiGRjaBARkWxxMXoqEkRRBAA4HI6wbtdut4d1e9HG+mOL9ccW6++eb5/p24e2J4idvdLLNTc348iRI7Eug4ioV8rIyEBSUlKH5X02NDweD1paWqBWq0OeeoSI6EoliiKcTicSExOhUHTsweizoUFEROHHjnAiIpKNoUFERLIxNIiISDaGBhERycbQICIi2RgaREQkG0ODiIhkY2jIsHLlStxzzz2455578OKLL8a6nMv2wgsvYM6cObEuI2QVFRUwmUyYMGECFi9eHOtyQlZWVib9/rzwwguxLkc2s9mMiRMn4syZMwCAPXv2oKioCOPHj8eKFStiXF332te/fv16TJw4EUVFRXjuuefCPsVQuLWv3+ftt9/GI488EqOqGBrd2rNnD3bt2oUPPvgAGzduxJdffont27fHuqyQ7d27Fx988EGsywjZ6dOnUVxcjJKSEmzatAmHDx/Gzp07Y12WbFarFUuWLMHatWtRVlaG/fv3Y8+ePbEuq1sHDx7E5MmTcfLkSQCAzWbD3LlzUVJSgq1bt6K6ujqu/x/a13/ixAmsWrUKf/vb37Bp0yZ4PB68++67sS2yC+3r9zl27BjeeOON2BTViqHRjZSUFMyZMwcajQZqtRrXX389zp07F+uyQtLU1IQVK1Zg+vTpsS4lZNu3b8fdd9+N1NRUqNVqrFixAjk5ObEuSza32w2PxwOr1QqXywWXywWtVhvrsrpVWlqK4uJiGI1GAMChQ4cwbNgwDB06FCqVCkVFRSgvL49xlZ1rX79Go0FxcTEMBgMEQUBGRkZc/x23rx/wTiS4YMECPPXUUzGsrA/Pchsuw4cPl74+efIk/vGPf2DdunUxrCh0CxYswDPPPIPa2tpYlxKyU6dOQa1WY/r06aitrcWdd96Jp59+OtZlyWYwGDBr1ixMmDABOp0Ot912G0aMGBHrsrq1ZMmSgOfnz59HSkqK9NxoNKKuri7aZcnWvv709HSkp6cDAC5evIh33nkHS5cujUVpsrSvHwBeeuklTJo0CUOGDIlBRW14piHT0aNH8bOf/Qy///3vcc0118S6HNn+/ve/Iy0tDXl5ebEu5bK43W7s3bsXzz//PNavX49Dhw71qma2r776Cu+99x527NiByspKKBQKrFq1KtZlhczj8QRM/CmKYq+cCLSurg6PPvooJk2ahNGjR8e6HNl2796N2tpaTJo0KdalMDTkqKqqwmOPPYbf/va3uP/++2NdTki2bt2K3bt3495778Wrr76KiooKPP/887EuS7bk5GTk5eVh0KBBSEhIwI9+9CMcOnQo1mXJtmvXLuTl5WHw4MHQaDQwmUzYt29frMsKWWpqKurr66Xn9fX1AU0nvcHx48fx0EMP4f7778evfvWrWJcTki1btuDo0aO49957MX/+fFRXV8fsjJvNU92ora3Fr371K6xYsaJXHq2vXr1a+vr999/Hvn37MHfu3BhWFJpx48bh2WefxaVLl5CYmIjKykr88Ic/jHVZsmVmZuJPf/oTLBYLdDodKioqcPPNN8e6rJDl5OTgxIkTOHXqFIYMGYItW7bExVGvXGazGU888QSefvpp3HfffbEuJ2T+TWmff/45Vq5ciVdeeSUmtTA0urFq1SrY7XYsW7ZMWvbQQw9h8uTJMazqypGTk4Np06ZhypQpcDqduP3223vVzmrMmDE4fPgwTCYT1Go1br75Zjz55JOxLitkWq0Wy5Ytw8yZM2G32zF27FgUFhbGuizZNmzYgIaGBqxevVo6kLrrrrswa9asGFfW+/B+GkREJBv7NIiISDaGBhERycbQICIi2RgaREQkG0ODiIhk45BbojBzOp0YN24cMjMz8dZbb0nLDxw4gJdeeglNTU0QRRGpqal49tlnpalqbrzxRmRkZEChUEAQBFitVhgMBvzxj3/sldd2UN/E0CAKs+3btyMzMxPV1dU4fvw4rr/+ejgcDvziF7/AX/7yF2RlZQHwTpn+85//HJ988gmUSiUAYM2aNRg0aJC0rVWrVmHx4sVYv359TD4LUXsMDaIwW7duHe6++2585zvfwZo1a7Bo0SJYrVY0NzfDYrFI6/3kJz+BwWCA2+2WQsOfy+VCbW0t+vfvH83yibrEi/uIwujYsWO47777UFlZidOnT+ORRx7Bp59+ioEDB2L16tV45ZVXkJycjBEjRmD06NG45557oNPpALQ1TwFAY2MjtFotxo0bhxkzZmDw4MGx/FhEEnaEE4XRunXrMG7cOAwcOBC33HILhgwZgtLSUgDA448/jt27d2P+/PlISUnBm2++ifvuuw/Nzc3S969ZswabN2/G66+/DpvNhtGjRzMwKK7wTIMoTCwWC/Lz86HRaJCQkADAO1GeVqvFSy+9hEOHDmHatGnS+i6XCxMnTsTTTz+NwsJC3Hjjjdi7d6/Up7FlyxYUFxejrKws5vdQIPLhmQZRmGzevBkDBgxAZWUlKioqUFFRgY8//hgWiwX79u3Da6+9hv3790vr19fXw2w2S01S7U2cOBG33HJLXN8siK487AgnCpN169bh8ccfD+jU7tevHx555BHs2LED//u//4sVK1bgv//9L7RaLZKSkvD888/juuuu63Sbf/jDH/CTn/wElZWVuOOOO6LxMYi6xOYpIiKSjc1TREQkG0ODiIhkY2gQEZFsDA0iIpKNoUFERLIxNIiISDaGBhERycbQICIi2f4/YMR2VOdTbBYAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# used for p and r^2 calculation\n",
    "if only_czech_republic:\n",
    "    dataframe_to_regress = merged_sigs_with_metadata.dropna(subset=['ASR'])\n",
    "    countries_with_ASR = counties_with_ASR.copy()\n",
    "    countries_with_ASR.replace(' ', '<br>', regex=True, inplace=True)\n",
    "else:\n",
    "    dataframe_to_regress = merged_sigs_with_metadata.dropna(subset=['ASR'])\n",
    "#     dataframe_to_regress = dataframe_to_regress.replace('Missing', np.nan)\n",
    "#     dataframe_to_regress = dataframe_to_regress.dropna(subset=['stage'])\n",
    "    dataframe_to_regress['ASR'] = pd.to_numeric(dataframe_to_regress['ASR'])\n",
    "\n",
    "# arrange outputs\n",
    "if only_czech_republic:\n",
    "    output_folder = 'incidence/%s/ASR_vs_sigs/Czechia/%s/' % (run_name, analysis)\n",
    "elif drop_AA_countries:\n",
    "    output_folder = 'incidence/%s/ASR_vs_sigs/noBalkans/%s/' % (run_name, analysis)\n",
    "else:\n",
    "    output_folder = 'incidence/%s/ASR_vs_sigs/%s/' % (run_name, analysis)\n",
    "if linear_fit_over_countries:\n",
    "    output_folder = output_folder.replace('ASR_vs_sigs','ASR_vs_sigs/fit_over_countries')\n",
    "elif adjust_regressions:\n",
    "    output_folder = output_folder.replace('ASR_vs_sigs','ASR_vs_sigs_adjusted')\n",
    "    if drop_thailand_in_adjusted_regressions:\n",
    "        output_folder = output_folder.replace('ASR_vs_sigs','ASR_vs_sigs_noThai')\n",
    "if relative_attributions:\n",
    "    output_folder = output_folder.replace('ASR_vs_sigs','ASR_vs_sigs_relative')\n",
    "\n",
    "make_folder_if_not_exists(output_folder)\n",
    "mean_or_median_label = 'median' if show_median else 'mean'\n",
    "results_dataframe = pd.DataFrame(columns=['Signature', '2.5%', '97.5%', 'OR', 'p-value', 'p-value (corr)', 'Regression type'])\n",
    "results_dataframe = results_dataframe.set_index(['Signature'])\n",
    "results_dataframe['Regression type'] = 'linear'\n",
    "\n",
    "# function to improve positions of labels\n",
    "def improve_text_position(x):\n",
    "    \"\"\" it is more efficient if the x values are sorted \"\"\"\n",
    "    positions = ['top center', 'bottom center'] #\n",
    "    return [positions[i % len(positions)] for i in range(len(x))]\n",
    "\n",
    "for signature in signatures_with_at_least_5_cases + [mtype + '_burden' for mtype in mutation_types]:\n",
    "    if analysis == 'COSMIC':\n",
    "        signature_name_to_plot = signature.replace('SBS1536A','SBS40b')\n",
    "        signature_name_to_plot = signature_name_to_plot.replace('SBS1536B','SBS40a')\n",
    "        signature_name_to_plot = signature_name_to_plot.replace('SBS1536F','SBS40c')\n",
    "        signature_name_to_plot = signature_name_to_plot.replace('SBS22','SBS22a')\n",
    "        signature_name_to_plot = signature_name_to_plot.replace('SBS1536I','SBS22b')\n",
    "        signature_name_to_plot = signature_name_to_plot.replace('SBS1536I','SBS22b')\n",
    "        signature_name_to_plot = signature_name_to_plot.replace('DBS78D','DBS20')\n",
    "        signature_name_to_plot = signature_name_to_plot.replace('DBS78C','DBS_C')\n",
    "        signature_name_to_plot = signature_name_to_plot.replace('ID83C','ID23')\n",
    "    else:\n",
    "        signature_name_to_plot = signature\n",
    "    signature_label = \"%s %s attribution\" % (signature_name_to_plot, mean_or_median_label)\n",
    "    if relative_attributions:\n",
    "        signature_label = \"%s %s relative attribution\" % (signature_name_to_plot, mean_or_median_label)\n",
    "    if 'burden' in signature:\n",
    "        signature_label = signature.replace('_', ' ').replace('burden', '%s total mutation burden' % mean_or_median_label)\n",
    "    fig = px.scatter(countries_with_ASR, x=\"ASR\", y=signature,\n",
    "                     text=\"county\" if only_czech_republic else \"country\",\n",
    "                     size=\"N\", color=signature + '_freq',\n",
    "                     color_continuous_scale=\"Viridis_r\",\n",
    "                     error_y=signature + '_plus_error' if show_error_bars else None,\n",
    "                     error_y_minus=signature + '_minus_error' if show_error_bars else None,\n",
    "                     labels={\"ASR\": \"ASR (kidney cancer)\",\n",
    "                     signature: signature_label}\n",
    "                    )\n",
    "    \n",
    "    # remove warnings\n",
    "    pio.full_figure_for_development(fig, warn=False)\n",
    "    \n",
    "    # retrieve model estimates from linear fit\n",
    "    if linear_fit_over_countries:\n",
    "#         model = px.get_trendline_results(fig)\n",
    "#         results = model.iloc[0][\"px_fit_results\"]\n",
    "        results = smf.ols(formula=\"%s ~ ASR\" % signature, data=countries_with_ASR).fit()\n",
    "        alpha = results.params[0]\n",
    "        beta = results.params[1]\n",
    "        slope = beta\n",
    "        intercept = alpha\n",
    "        pvalue = results.pvalues[1]\n",
    "        r_squared = results.rsquared\n",
    "        rvalue = np.sqrt(r_squared)\n",
    "    else:\n",
    "        # regress data with all cases\n",
    "        dataframe_to_regress_locally = dataframe_to_regress.dropna(subset=[signature])\n",
    "        if only_czech_republic:\n",
    "            slope, intercept, rvalue, pvalue, stderr = stats.linregress(x=dataframe_to_regress_locally['ASR'], y=dataframe_to_regress_locally[signature + '_CI' if not 'burden' in signature else signature])\n",
    "        else:\n",
    "            slope, intercept, rvalue, pvalue, stderr = stats.linregress(x=dataframe_to_regress_locally['ASR'], y=dataframe_to_regress_locally[signature + '_CI' if not 'burden' in signature else signature])\n",
    "    \n",
    "    summary = f'r={rvalue:.2f}, p={pvalue:.2g}'\n",
    "    if adjust_regressions and not linear_fit_over_countries:\n",
    "        dataframe_to_regress_locally = dataframe_to_regress.dropna(subset=[signature])\n",
    "        # drop 5 thai cases\n",
    "        if drop_thailand_in_adjusted_regressions and not only_czech_republic:\n",
    "            dataframe_to_regress_locally = dataframe_to_regress_locally[dataframe_to_regress_locally.country != 'Thailand'].copy()\n",
    "        # calculate signature frequency\n",
    "        signature_frequency = 1\n",
    "        if not 'burden' in signature:\n",
    "            # check if DBS frequency is correct\n",
    "            signature_frequency = len(dataframe_to_regress_locally.query('%s > 0' % (signature + '_CI')))/len(dataframe_to_regress_locally.index)\n",
    "        if log_transform and signature_frequency>=0.75:\n",
    "            if 'burden' in signature:\n",
    "                dataframe_to_regress_locally.loc[ dataframe_to_regress_locally[signature] == 0, signature] = 1\n",
    "                dataframe_to_regress_locally[signature] = np.log(dataframe_to_regress_locally[signature]).copy()\n",
    "            else:\n",
    "                dataframe_to_regress_locally.loc[ dataframe_to_regress_locally[signature + '_CI'] == 0, signature  + '_CI'] = 1\n",
    "                dataframe_to_regress_locally[signature + '_CI'] = np.log(dataframe_to_regress_locally[signature + '_CI']).copy()               \n",
    "        logit = False\n",
    "        if 'burden' in signature:\n",
    "            if use_robust_regressions:\n",
    "                results = smf.rlm(formula=\"%s ~ ASR + %s age_diag\" % (signature, 'sex + ' if not (males_only or females_only) else ''), data=dataframe_to_regress_locally, M=sm.robust.norms.HuberT()).fit()\n",
    "            else:\n",
    "                results = smf.ols(formula=\"%s ~ ASR + %s age_diag\" % (signature, 'sex + ' if not (males_only or females_only) else ''), data=dataframe_to_regress_locally).fit()\n",
    "        else:\n",
    "            if signature_frequency>=0.75:\n",
    "                if use_robust_regressions:\n",
    "                    results = smf.rlm(formula=\"%s_CI ~ ASR + %s age_diag\" % (signature, 'sex + ' if not (males_only or females_only) else ''), data=dataframe_to_regress_locally, M=sm.robust.norms.HuberT()).fit()\n",
    "                else:\n",
    "                    results = smf.ols(formula=\"%s_CI ~ ASR + %s age_diag\" % (signature, 'sex + ' if not (males_only or females_only) else ''), data=dataframe_to_regress_locally).fit()\n",
    "            else:\n",
    "                # run logistic regression\n",
    "                logit = True\n",
    "                dataframe_to_regress_locally[signature + '_bool'] = dataframe_to_regress_locally[signature + '_CI'].gt(0).copy()\n",
    "                dataframe_to_regress_locally[signature + '_bool'] = dataframe_to_regress_locally[signature + '_bool'].astype(int)\n",
    "                try:\n",
    "                    results = smf.logit(formula=\"%s_bool ~ ASR + %s age_diag\" % (signature, 'sex + ' if not (males_only or females_only) else ''), data=dataframe_to_regress_locally).fit(cov_type=\"hc3\" if use_robust_regressions else \"nonrobust\")\n",
    "                except Exception as e:\n",
    "                    print('* WARNING: Logistic regression for model %s ~ ASR + %s age_diag failed' % (signature + '_bool', 'sex + ' if not (males_only or females_only) else ''))\n",
    "                    print(e)\n",
    "                    continue\n",
    "        params = results.params\n",
    "        if signature in ['SBS12','SBS22','SBS1536I','SBS1536A','SBS1536B','SBS4','ID5','SBS_burden','ID_burden']:\n",
    "            print('Freq:',signature_frequency)\n",
    "            print('Logit:',logit)\n",
    "            print(results.summary())\n",
    "        # ORs with confidence intervals\n",
    "        conf = results.conf_int()\n",
    "        conf['OR'] = params\n",
    "        conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "        if logit:\n",
    "            conf = np.exp(conf)\n",
    "        pvalues_dataframe = results.pvalues.to_frame()\n",
    "        conf['p-value'] = pvalues_dataframe.copy()\n",
    "        # Bonferroni adjustment based on the number of signatures or mutation types\n",
    "        number_of_tests = 1\n",
    "        number_of_tests = len(signatures_with_at_least_5_cases) + len(mutation_types)\n",
    "        print('Number of tests = ', number_of_tests)\n",
    "        corrected_pvalues_dataframe = pvalues_dataframe*number_of_tests\n",
    "        corrected_pvalues_dataframe[corrected_pvalues_dataframe > 1] = 1\n",
    "        conf['p-value (corr)'] = corrected_pvalues_dataframe\n",
    "        results_dataframe.loc[signature,:] = conf.loc['ASR']\n",
    "        results_dataframe.loc[signature, 'Regression type'] = 'logistic' if logit else 'linear'\n",
    "#         intercept = results.params[0]\n",
    "#         slope = results.params['ASR']\n",
    "        pvalue_adj = results.pvalues['ASR']\n",
    "        if not logit and show_r_squared:\n",
    "            r_squared_adj = results.rsquared\n",
    "            summary = f'R<sup>2</sup>={r_squared_adj:.2f}<br>p<sub>ASR</sub>={pvalue_adj:.2g}'\n",
    "        else:\n",
    "            summary = f'p<sub>ASR</sub>={pvalue_adj:.2g}' # (all cases)'\n",
    "#         rvalue_adj = np.sqrt(r_squared_adj)\n",
    "#     print(results_dataframe.loc[signature])\n",
    "    # labels\n",
    "    fig.layout.coloraxis.colorbar.title = 'Signature frequency'\n",
    "    fig.layout.coloraxis.colorbar.titleside = 'right'\n",
    "    fig.layout.coloraxis.colorbar.lenmode = 'fraction'\n",
    "    fig.layout.coloraxis.colorbar.len = 0.75\n",
    "    fig.update_traces(textposition='top center')\n",
    "    if only_czech_republic:\n",
    "        fig.update_layout(title_text='ASR/%s association across Czech counties' % (signature_name_to_plot.replace('_', ' ')), title_x=0.5)\n",
    "    else:\n",
    "        fig.update_layout(title_text='ASR/%s association across countries' % (signature_name_to_plot.replace('_', ' ')), title_x=0.5)\n",
    "    if sex_dependent_ASR:\n",
    "        if only_czech_republic:\n",
    "            fig.update_layout(title_text='Sex-specific ASR/%s association across Czech counties' % (signature_name_to_plot.replace('_', ' ')), title_x=0.5)\n",
    "        else:\n",
    "            fig.update_layout(title_text='Sex-specific ASR/%s association across countries' % (signature_name_to_plot.replace('_', ' ')), title_x=0.5)\n",
    "    \n",
    "    if males_only:\n",
    "        if only_czech_republic:\n",
    "            fig.update_layout(title_text='Males: ASR/%s association across Czech counties' % (signature_name_to_plot.replace('_', ' ')), title_x=0.5)\n",
    "        else:\n",
    "            fig.update_layout(title_text='Males: ASR/%s association across countries' % (signature_name_to_plot.replace('_', ' ')), title_x=0.5)\n",
    "    elif females_only:\n",
    "        if only_czech_republic:\n",
    "            fig.update_layout(title_text='Females: ASR/%s association across Czech counties' % (signature_name_to_plot.replace('_', ' ')), title_x=0.5)\n",
    "        else:\n",
    "            fig.update_layout(title_text='Females: ASR/%s association across countries' % (signature_name_to_plot.replace('_', ' ')), title_x=0.5)\n",
    "        \n",
    "    \n",
    "    # draw the slope line\n",
    "    fig.add_trace(go.Scatter(x=dataframe_to_regress['ASR'], y=slope*dataframe_to_regress['ASR']+intercept, mode='lines', line_shape='linear', showlegend=False))\n",
    "    \n",
    "    # set background to transparent\n",
    "    fig.update_layout({\n",
    "    'plot_bgcolor': 'rgba(0,0,0,0)',\n",
    "    'paper_bgcolor': 'rgba(0,0,0,0)'\n",
    "    })\n",
    "    \n",
    "    # set text size\n",
    "    fig.update_layout(\n",
    "    font=dict(\n",
    "        family=\"Arial\",\n",
    "        size=18\n",
    "    )\n",
    "    )\n",
    "\n",
    "    # add confidence band using regplot\n",
    "    plt.clf()\n",
    "    if not linear_fit_over_countries:\n",
    "        rg=sns.regplot(x = 'ASR', # Horizontal axis\n",
    "                       y = signature + '_CI' if not 'burden' in signature else signature, # Vertical axis\n",
    "                       data=dataframe_to_regress, # Data source\n",
    "                       fit_reg=True, \n",
    "                       x_estimator=np.mean,\n",
    "                       truncate=False)\n",
    "    else:\n",
    "        rg=sns.regplot(x = 'ASR', # Horizontal axis\n",
    "                       y = signature, # Vertical axis\n",
    "                       data=countries_with_ASR, # Data source\n",
    "                       fit_reg=True, \n",
    "                       x_estimator=np.mean,\n",
    "                       truncate=False)\n",
    "\n",
    "    X=rg.get_lines()[0].get_xdata()# x-coordinate of points along the regression line\n",
    "    Y=rg.get_lines()[0].get_ydata()# y-coordinate\n",
    "    P=rg.get_children()[1].get_paths()#The list of Path(s) bounding the shape of 95% confidence interval-transparent\n",
    "    p_codes={1:'M', 2: 'L', 79: 'Z'}#dict to get the Plotly codes for commands to define the svg path\n",
    "    path=''\n",
    "    for s in P[0].iter_segments():\n",
    "        c=p_codes[s[1]]\n",
    "        xx, yy=s[0]\n",
    "        path+=c+str('{:.5f}'.format(xx))+' '+str('{:.5f}'.format(yy))\n",
    "\n",
    "    shapes=[dict(type='path',\n",
    "                 path=path,\n",
    "                 line=dict(width=0.1,color='rgba(68, 122, 219, 0.25)' ),\n",
    "                 fillcolor='rgba(68, 122, 219, 0.25)')]   \n",
    "\n",
    "    fig.update_layout(shapes=shapes)\n",
    "    offset = 1.5 if only_czech_republic else 3.8\n",
    "    if drop_AA_countries:\n",
    "        offset = 2.5\n",
    "    fig.update_layout(xaxis_range=[min(dataframe_to_regress['ASR'])-offset,max(dataframe_to_regress['ASR'])+offset])\n",
    "    if not 'burden' in signature and relative_attributions:\n",
    "        fig.update_layout(yaxis_range=[min(dataframe_to_regress.groupby('country')[signature + '_CI'].mean())*0.9-0.01,max(dataframe_to_regress.groupby('country')[signature + '_CI'].mean())*1.2])\n",
    "#     fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey')\n",
    "#     fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey')\n",
    "#     fig.update_xaxes(showline=True, linewidth=1, linecolor='lightgrey')\n",
    "#     fig.update_yaxes(showline=True, linewidth=1, linecolor='lightgrey')\n",
    "    fig.update_xaxes(showline=True, linewidth=2, linecolor='black', showgrid=False, zeroline=False)\n",
    "    fig.update_yaxes(showline=True, linewidth=2, linecolor='black', showgrid=False, zeroline=False)\n",
    "\n",
    "    full_fig = fig.full_figure_for_development(warn=False)\n",
    "    x_range = full_fig.layout.xaxis.range\n",
    "    y_range = full_fig.layout.yaxis.range\n",
    "    \n",
    "    print(summary)\n",
    "    fig.add_annotation(\n",
    "            x = x_range[1]*0.65 if only_czech_republic else x_range[1]*0.2,\n",
    "            y = y_range[1]*0.9 if relative_attributions else y_range[1]-0.3,\n",
    "            xref=\"x\",\n",
    "            yref=\"y\",\n",
    "            text=summary,\n",
    "            showarrow=False,\n",
    "            font=dict(\n",
    "                family=\"Arial\",\n",
    "                size=18,\n",
    "                color=\"#000000\"\n",
    "                ),\n",
    "            align=\"left\",\n",
    "            arrowhead=2,\n",
    "            arrowsize=1,\n",
    "            arrowwidth=2,\n",
    "            arrowcolor=\"#636363\",\n",
    "            ax=20,\n",
    "            ay=-30,\n",
    "            borderwidth=2,\n",
    "            borderpad=4,\n",
    "            opacity=0.8\n",
    "            )\n",
    "    fig.data = (fig.data[1],fig.data[0])\n",
    "    text_positions = improve_text_position(countries_with_ASR['ASR']) \n",
    "    if only_czech_republic:\n",
    "        text_positions = ['middle right', # Olomouc\n",
    "                          'middle left', # Prague\n",
    "                          'middle left', # Brno\n",
    "                          'top left', # Ceske\n",
    "                         ]\n",
    "    elif relative_attributions and 'SBS40b' in signature_name_to_plot:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle right', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'bottom right', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle left', # Poland\n",
    "                          'middle right', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'bottom left', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif relative_attributions and 'SBS40a' in signature_name_to_plot:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle right', # Czechia\n",
    "                          'bottom right', # Canada\n",
    "                          'middle right', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle left', # Poland\n",
    "                          'middle right', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle right', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif relative_attributions and 'ID1' in signature_name_to_plot:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle right', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'middle right', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle right', # Poland\n",
    "                          'middle left', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle left', # Serbia\n",
    "                          'middle right', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif relative_attributions and 'ID5' in signature_name_to_plot:\n",
    "        text_positions = ['bottom right', # Lithuania\n",
    "                          'top right', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'bottom right', # Russia\n",
    "                          'top right', # UK\n",
    "                          'bottom left', # Poland\n",
    "                          'bottom right', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle left', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif 'CN' in signature_name_to_plot:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle right', # Czechia\n",
    "                          'top right', # Canada\n",
    "                          'bottom right', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle right', # Poland\n",
    "                          'middle left', # Romania\n",
    "                          'middle right', # Japan\n",
    "                          'middle left', # Serbia\n",
    "                          'middle right', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif 'SV' in signature_name_to_plot:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle right', # Czechia\n",
    "                          'top right', # Canada\n",
    "                          'top right', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle right', # Poland\n",
    "                          'middle left', # Romania\n",
    "                          'middle right', # Japan\n",
    "                          'middle left', # Serbia\n",
    "                          'middle right', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif not relative_attributions and 'SBS40a' in signature_name_to_plot:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle right', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'middle right', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle right', # Poland\n",
    "                          'middle left', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle left', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif not relative_attributions and 'SBS40b' in signature_name_to_plot:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle right', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'middle right', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'top left', # Poland\n",
    "                          'top left', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle left', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif not relative_attributions and 'ID5' in signature_name_to_plot:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle right', # Czechia\n",
    "                          'top right', # Canada\n",
    "                          'middle right', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle right', # Poland\n",
    "                          'middle left', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle left', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif not relative_attributions and 'ID8' in signature_name_to_plot:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle left', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'bottom right', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle left', # Poland\n",
    "                          'middle left', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle right', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif 'ID_burden' in signature_name_to_plot and drop_AA_countries:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle right', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'middle left', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle right', # Poland\n",
    "                          'middle right', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle right', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif 'burden' in signature_name_to_plot and drop_AA_countries:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle right', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'middle right', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle right', # Poland\n",
    "                          'middle right', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle right', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif 'SBS_burden' in signature_name_to_plot:\n",
    "        text_positions = ['bottom right', # Lithuania\n",
    "                          'top center', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'bottom right', # Russia\n",
    "                          'top right', # UK\n",
    "                          'middle left', # Poland\n",
    "                          'middle right', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle right', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif 'DBS_burden' in signature_name_to_plot:\n",
    "        text_positions = ['bottom right', # Lithuania\n",
    "                          'top center', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'middle right', # Russia\n",
    "                          'middle right', # UK\n",
    "                          'middle left', # Poland\n",
    "                          'middle right', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle right', # Serbia\n",
    "                          'bottom left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    elif 'ID_burden' in signature_name_to_plot:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'top center', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'middle left', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle right', # Poland\n",
    "                          'middle right', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'top left', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    else:\n",
    "        text_positions = ['middle right', # Lithuania\n",
    "                          'middle right', # Czechia\n",
    "                          'middle right', # Canada\n",
    "                          'middle right', # Russia\n",
    "                          'middle left', # UK\n",
    "                          'middle left', # Poland\n",
    "                          'middle right', # Romania\n",
    "                          'middle left', # Japan\n",
    "                          'middle right', # Serbia\n",
    "                          'middle left', # Brazil \n",
    "                          'middle left' # Thailand\n",
    "                         ]\n",
    "    fig.update_traces(textposition=text_positions)\n",
    "    \n",
    "    fig.write_image(output_folder + 'ASR_%s.pdf' % (signature_name_to_plot))    \n",
    "#     fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.367187\n",
      "         Iterations 7\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:          SBS1536A_bool   No. Observations:                  961\n",
      "Model:                          Logit   Df Residuals:                      957\n",
      "Method:                           MLE   Df Model:                            3\n",
      "Date:                Tue, 06 Feb 2024   Pseudo R-squ.:                 0.07793\n",
      "Time:                        17:16:29   Log-Likelihood:                -352.87\n",
      "converged:                       True   LL-Null:                       -382.69\n",
      "Covariance Type:            nonrobust   LLR p-value:                 6.998e-13\n",
      "===============================================================================\n",
      "                  coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept      -2.0342      0.569     -3.572      0.000      -3.150      -0.918\n",
      "sex[T.Male]     0.2567      0.197      1.301      0.193      -0.130       0.643\n",
      "ASR             0.1931      0.031      6.131      0.000       0.131       0.255\n",
      "age_diag        0.0320      0.008      3.929      0.000       0.016       0.048\n",
      "===============================================================================\n",
      "               2.5%   97.5%      OR  p-value\n",
      "Intercept   4.3e-02 4.0e-01 1.3e-01  3.5e-04\n",
      "sex[T.Male] 8.8e-01 1.9e+00 1.3e+00  1.9e-01\n",
      "ASR         1.1e+00 1.3e+00 1.2e+00  8.7e-10\n",
      "age_diag    1.0e+00 1.0e+00 1.0e+00  8.5e-05\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:            SBS1536A_CI   R-squared:                       0.241\n",
      "Model:                            OLS   Adj. R-squared:                  0.239\n",
      "Method:                 Least Squares   F-statistic:                     101.5\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           4.56e-57\n",
      "Time:                        17:16:29   Log-Likelihood:                -7639.6\n",
      "No. Observations:                 961   AIC:                         1.529e+04\n",
      "Df Residuals:                     957   BIC:                         1.531e+04\n",
      "Df Model:                           3                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept   -1196.2149    137.314     -8.712      0.000   -1465.686    -926.744\n",
      "sex[T.Male]   233.1231     45.549      5.118      0.000     143.736     322.510\n",
      "ASR            62.2369      7.053      8.825      0.000      48.396      76.077\n",
      "age_diag       26.2063      1.897     13.815      0.000      22.484      29.929\n",
      "==============================================================================\n",
      "Omnibus:                       81.338   Durbin-Watson:                   1.829\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):              137.915\n",
      "Skew:                           0.590   Prob(JB):                     1.13e-30\n",
      "Kurtosis:                       4.433   Cond. No.                         386.\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "               2.5%    97.5%       OR  p-value\n",
      "Intercept   0.0e+00  0.0e+00  0.0e+00  1.3e-17\n",
      "sex[T.Male] 2.7e+62 1.2e+140 1.8e+101  3.7e-07\n",
      "ASR         1.0e+21  1.1e+33  1.1e+27  5.1e-18\n",
      "age_diag    5.8e+09  1.0e+13  2.4e+11  1.0e-39\n",
      "                 Generalized Linear Model Regression Results                  \n",
      "==============================================================================\n",
      "Dep. Variable:            SBS1536A_CI   No. Observations:                  961\n",
      "Model:                            GLM   Df Residuals:                      957\n",
      "Model Family:                 Poisson   Df Model:                            3\n",
      "Link Function:                    Log   Scale:                          1.0000\n",
      "Method:                          IRLS   Log-Likelihood:            -2.3470e+05\n",
      "Date:                Tue, 06 Feb 2024   Deviance:                   4.6199e+05\n",
      "Time:                        17:16:29   Pearson chi2:                 3.80e+05\n",
      "No. Iterations:                     5   Pseudo R-squ. (CS):              1.000\n",
      "Covariance Type:                  HC1                                         \n",
      "===============================================================================\n",
      "                  coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept       4.8743      0.128     38.196      0.000       4.624       5.124\n",
      "sex[T.Male]     0.2105      0.038      5.489      0.000       0.135       0.286\n",
      "ASR             0.0555      0.007      8.187      0.000       0.042       0.069\n",
      "age_diag        0.0237      0.002     14.460      0.000       0.020       0.027\n",
      "===============================================================================\n",
      "               2.5%   97.5%      OR  p-value\n",
      "Intercept   1.0e+02 1.7e+02 1.3e+02  0.0e+00\n",
      "sex[T.Male] 1.1e+00 1.3e+00 1.2e+00  4.0e-08\n",
      "ASR         1.0e+00 1.1e+00 1.1e+00  2.7e-16\n",
      "age_diag    1.0e+00 1.0e+00 1.0e+00  2.2e-47\n",
      "            Mixed Linear Model Regression Results\n",
      "==============================================================\n",
      "Model:               MixedLM  Dependent Variable:  SBS1536A_CI\n",
      "No. Observations:    961      Method:              REML       \n",
      "No. Groups:          11       Scale:               445720.5029\n",
      "Min. group size:     5        Log-Likelihood:      -7607.0159 \n",
      "Max. group size:     259      Converged:           Yes        \n",
      "Mean group size:     87.4                                     \n",
      "--------------------------------------------------------------\n",
      "              Coef.   Std.Err.   z    P>|z|   [0.025   0.975] \n",
      "--------------------------------------------------------------\n",
      "Intercept   -1191.862  250.027 -4.767 0.000 -1681.906 -701.818\n",
      "sex[T.Male]   247.649   44.667  5.544 0.000   160.102  335.195\n",
      "ASR            54.408   22.454  2.423 0.015    10.399   98.417\n",
      "age_diag       26.551    1.898 13.987 0.000    22.830   30.271\n",
      "Group Var   47326.750   41.562                                \n",
      "==============================================================\n",
      "\n",
      "[0.00039448]\n",
      "65.24639612065585 6.609495206384084e-16\n",
      "               2.5%    97.5%       OR  p-value\n",
      "Intercept   0.0e+00 1.6e-305  0.0e+00  1.9e-06\n",
      "sex[T.Male] 3.4e+69 3.7e+145 3.6e+107  3.0e-08\n",
      "ASR         3.3e+04  5.5e+42  4.3e+23  1.5e-02\n",
      "age_diag    8.2e+09  1.4e+13  3.4e+11  1.9e-44\n",
      "Group Var   9.8e-01  1.3e+00  1.1e+00  8.8e-02\n"
     ]
    }
   ],
   "source": [
    "# Testing SBS40b with mixed models\n",
    "ASR_variable = 'ASR'\n",
    "signature = 'SBS1536A'\n",
    "# drop 5 thai cases\n",
    "# dataframe_to_regress = dataframe_to_regress[dataframe_to_regress.country != 'Czechia'].copy()\n",
    "# print(countries_with_ASR)\n",
    "# mod = smf.ols(formula='ASR ~ SBS288A + SBS288B + SBS288C + SBS288D + SBS288E + SBS288F + SBS288G + SBS288H + SBS288K ', data=countries_with_ASR)\n",
    "# print(merged_sigs_with_metadata.columns)\n",
    "# print(len(dataframe_to_regress))\n",
    "# print(dataframe_to_regress.columns)\n",
    "dataframe_to_regress = dataframe_to_regress.dropna(subset=[ASR_variable])\n",
    "dataframe_to_regress[ASR_variable] = pd.to_numeric(dataframe_to_regress[ASR_variable])\n",
    "dataframe_to_regress[signature + '_bool'] = 0\n",
    "dataframe_to_regress.loc[dataframe_to_regress[signature + '_CI']>0, signature + '_bool'] = 1\n",
    "# print(dataframe_to_regress[signature + '_bool'])\n",
    "\n",
    "# logistic regression\n",
    "dataframe_to_regress_locally = dataframe_to_regress[[signature + '_bool', ASR_variable, 'age_diag', 'country', 'sex' ]] # 'stage', 'bmi_q', 'hypert', 'county', 'age_diag', 'bmi_q', 'diabetes', 'rec_period', 'tobacco_ever', 'tobacco_ever', 'hypert', 'diabetes']]\n",
    "logit = smf.logit(formula='%s_bool ~ %s + %s age_diag ' % (signature, ASR_variable, 'sex + ' if not (males_only or females_only) else ''), data=dataframe_to_regress_locally)\n",
    "linear_fit = logit.fit()\n",
    "print(linear_fit.summary())\n",
    "params = linear_fit.params\n",
    "# ORs with confidence intervals\n",
    "conf = linear_fit.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "conf = np.exp(conf)\n",
    "pvalues_dataframe = linear_fit.pvalues.to_frame()\n",
    "conf['p-value'] = pvalues_dataframe.copy()\n",
    "print(conf)\n",
    "\n",
    "# linear regression\n",
    "dataframe_to_regress_locally = dataframe_to_regress[[signature + '_CI', ASR_variable, 'age_diag', 'country', 'sex' ]] # 'stage', 'bmi_q', 'hypert', 'county', 'age_diag', 'bmi_q', 'diabetes', 'rec_period', 'tobacco_ever', 'tobacco_ever', 'hypert', 'diabetes']]\n",
    "# dataframe_to_regress_locally = dataframe_to_regress_locally.replace('missing', np.nan)\n",
    "# dataframe_to_regress_locally = dataframe_to_regress_locally.replace('Missing', np.nan)\n",
    "# print(dataframe_to_regress)\n",
    "mod = smf.ols(formula='%s_CI ~ %s + %s age_diag ' % (signature, ASR_variable, 'sex + ' if not (males_only or females_only) else ''), data=dataframe_to_regress_locally)\n",
    "# mod = smf.ols(formula='%s ~ country + age_diag + sex + bmi_q + tobacco_ever + hypert + diabetes  ' % ASR_variable, data=dataframe_to_regress)\n",
    "# mod = smf.ols(formula='%s ~ age_diag + sex + country + stage' % ASR_variable, data=dataframe_to_regress)\n",
    "linear_fit = mod.fit()\n",
    "print(linear_fit.summary())\n",
    "params = linear_fit.params\n",
    "# ORs with confidence intervals\n",
    "conf = linear_fit.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "conf = np.exp(conf)\n",
    "pvalues_dataframe = linear_fit.pvalues.to_frame()\n",
    "conf['p-value'] = pvalues_dataframe.copy()\n",
    "print(conf)\n",
    "\n",
    "# Quasi-poisson regression\n",
    "# from statsmodels.genmod.generalized_linear_model import GLM\n",
    "# from statsmodels.genmod.families import Poisson\n",
    "# from statsmodels.genmod import families\n",
    "dataframe_to_regress_locally = dataframe_to_regress[[signature + '_CI', ASR_variable, 'age_diag', 'country', 'sex' ]] # 'stage', 'bmi_q', 'hypert', 'county', 'age_diag', 'bmi_q', 'diabetes', 'rec_period', 'tobacco_ever', 'tobacco_ever', 'hypert', 'diabetes']]\n",
    "# dataframe_to_regress_locally = dataframe_to_regress_locally.replace('missing', np.nan)\n",
    "# dataframe_to_regress_locally = dataframe_to_regress_locally.replace('Missing', np.nan)\n",
    "# print(dataframe_to_regress)\n",
    "mod = smf.glm(formula='%s_CI ~ %s + %s age_diag ' % (signature, ASR_variable, 'sex + ' if not (males_only or females_only) else ''), data=dataframe_to_regress_locally, family = sm.families.Poisson())\n",
    "# mod = smf.ols(formula='%s ~ country + age_diag + sex + bmi_q + tobacco_ever + hypert + diabetes  ' % ASR_variable, data=dataframe_to_regress)\n",
    "# mod = smf.ols(formula='%s ~ age_diag + sex + country + stage' % ASR_variable, data=dataframe_to_regress)\n",
    "quasipoisson_fit = mod.fit(cov_type='HC1')\n",
    "print(quasipoisson_fit.summary())\n",
    "params = quasipoisson_fit.params\n",
    "# ORs with confidence intervals\n",
    "conf = quasipoisson_fit.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "conf = np.exp(conf)\n",
    "pvalues_dataframe = quasipoisson_fit.pvalues.to_frame()\n",
    "conf['p-value'] = pvalues_dataframe.copy()\n",
    "print(conf)\n",
    "\n",
    "from statsmodels.formula.api import mixedlm\n",
    "from pandas.api.types import is_numeric_dtype\n",
    "stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)\n",
    "\n",
    "def lrtest(llmin, llmax):\n",
    "    lr = 2 * (llmax - llmin)\n",
    "    p = stats.chisqprob(lr, 1) # llmax has 1 dof more than llmin\n",
    "    return lr, p\n",
    "\n",
    "mle_random_intercept = mixedlm(\"%s_CI ~ %s + %s age_diag\" % (signature, ASR_variable, 'sex + ' if not (males_only or females_only) else ''), dataframe_to_regress_locally, groups=dataframe_to_regress_locally['country'])\n",
    "mle_random_intercept_fit = mle_random_intercept.fit(maxiter=1000)\n",
    "print(mle_random_intercept_fit.summary())\n",
    "print(mle_random_intercept.score(mle_random_intercept_fit.params_object))\n",
    "# lr, lr_p = lrtest(linear_fit.llf, mle_random_slope_fit.llf)\n",
    "# print(lr, lr_p)\n",
    "lr, lr_p = lrtest(linear_fit.llf, mle_random_intercept_fit.llf)\n",
    "print(lr, lr_p)\n",
    "params = mle_random_intercept_fit.params \n",
    "# ORs with confidence intervals\n",
    "conf = mle_random_intercept_fit.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "conf = np.exp(conf)\n",
    "pvalues_dataframe = mle_random_intercept_fit.pvalues.to_frame()\n",
    "conf['p-value'] = pvalues_dataframe.copy()\n",
    "print(conf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Figure size 252x167.76 with 0 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Incidence and PRS instruments\n",
    "from matplotlib.pyplot import figure\n",
    "from tools.data_handling import calculate_p_value, latexify_p_value\n",
    "sns.set(rc={'figure.figsize':(3.5,2.33)})\n",
    "sns.set_context(\"paper\", rc={\"font.size\":7,\"axes.titlesize\":7,\"axes.labelsize\":7})\n",
    "sns.color_palette(\"viridis\")\n",
    "sns.set_style('whitegrid', rc={\n",
    "    'xtick.bottom': True,\n",
    "    'ytick.left': True,\n",
    "})\n",
    "labels = {\n",
    "'RCC_risk_QC':'RCC risk',\n",
    "'BMI':'BMI',\n",
    "'SBP':'Systolic blood pressure',\n",
    "'DBP':'Diastolic blood pressure',\n",
    "'FG_combined':'Fasting glucose',\n",
    "'FI_combined':'Fasting insulin',\n",
    "'SmkInit':'Tobacco smoking',\n",
    "}\n",
    "def text_coordinates(ax):\n",
    "    return ((ax.get_xlim()[1] - ax.get_xlim()[0])*0.010 + ax.get_xlim()[0],\n",
    "            (ax.get_ylim()[1] - ax.get_ylim()[0])*1.03 + ax.get_ylim()[0])\n",
    "# sns.set_theme(style=\"whitegrid\")\n",
    "dataframe_to_regress = merged_sigs_with_metadata.dropna(subset=['ASR'])\n",
    "dataframe_to_regress['ASR'] = pd.to_numeric(dataframe_to_regress['ASR'])\n",
    "dataframe_to_regress_locally = dataframe_to_regress.copy()\n",
    "dataframe_to_regress_locally = dataframe_to_regress_locally.sort_values([\"ASR\"])\n",
    "dataframe_to_regress_locally = dataframe_to_regress_locally[dataframe_to_regress_locally.country != 'Thailand']\n",
    "dataframe_to_regress_locally = dataframe_to_regress_locally[dataframe_to_regress_locally.country != 'Japan']\n",
    "for PRS_variable in ['RCC_risk_QC', 'SBP', 'DBP', 'FG_combined', 'FI_combined', 'BMI', 'SmkInit']:\n",
    "    # insert PRS for ccRCC risk\n",
    "    PRS = pd.read_csv('./PRS/%s_0.1_Final.all_score' % PRS_variable,index_col=1, sep = ' ')\n",
    "    PRS_mean = PRS['Pt_1'].mean()\n",
    "    PRS['Pt_1'] = PRS['Pt_1'] - PRS_mean\n",
    "    PRS.index = PRS.index.str.replace(r'b', 'a')\n",
    "    PRS.drop(['FID'],axis=1,inplace=True)\n",
    "    PRS['Pt_1'] = pd.to_numeric(PRS['Pt_1'])\n",
    "    PRS.rename(columns={'Pt_1':PRS_variable},inplace=True)\n",
    "    if not PRS_variable in dataframe_to_regress_locally.columns:\n",
    "        dataframe_to_regress_locally = pd.concat([PRS, dataframe_to_regress_locally], axis=1)#.fillna(0)\n",
    "    ax = sns.boxplot(x=\"country\", y=PRS_variable, data=dataframe_to_regress_locally,order=['Brazil','Serbia','Romania','Poland','UK','Canada','Russia','Czechia','Lithuania'])\n",
    "    p = calculate_p_value(dataframe_to_regress_locally,'country',PRS_variable)\n",
    "#     ax.set(ylim=(0, 30000))\n",
    "    ax.text(text_coordinates(ax)[0], text_coordinates(ax)[1], r\"Kruskal-Wallis, $p = %.1e$\" % p)\n",
    "    # Rotate the tick labels and set their alignment\n",
    "    plt.setp(ax.get_xticklabels(), rotation=45, ha=\"right\", rotation_mode=\"anchor\")\n",
    "    ax.set(xlabel=None)\n",
    "    ax.set(ylabel=labels[PRS_variable] + ' PRS')\n",
    "    plt.tight_layout()\n",
    "    plt.savefig(\"/Users/senkins/work/RCC_analysis/PRS/%s.pdf\" % PRS_variable)\n",
    "    plt.clf()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "945\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                    ASR   R-squared:                       0.321\n",
      "Model:                            OLS   Adj. R-squared:                  0.314\n",
      "Method:                 Least Squares   F-statistic:                     48.13\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           1.27e-63\n",
      "Time:                        17:19:27   Log-Likelihood:                -1893.9\n",
      "No. Observations:                 824   AIC:                             3806.\n",
      "Df Residuals:                     815   BIC:                             3848.\n",
      "Df Model:                           8                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept       3.6866      0.665      5.546      0.000       2.382       4.991\n",
      "sex[T.Male]     0.3060      0.175      1.750      0.081      -0.037       0.649\n",
      "RCC_risk_QC     0.1866      0.178      1.045      0.296      -0.164       0.537\n",
      "age_diag        0.0052      0.008      0.694      0.488      -0.010       0.020\n",
      "PC1           488.3135     55.137      8.856      0.000     380.086     596.541\n",
      "PC2          -127.8457     75.523     -1.693      0.091    -276.089      20.397\n",
      "PC3           -46.2703      6.512     -7.105      0.000     -59.053     -33.488\n",
      "PC4            40.8873     18.760      2.180      0.030       4.065      77.710\n",
      "PC5            36.2862     22.194      1.635      0.102      -7.279      79.851\n",
      "==============================================================================\n",
      "Omnibus:                       45.056   Durbin-Watson:                   0.799\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               19.248\n",
      "Skew:                           0.121   Prob(JB):                     6.61e-05\n",
      "Kurtosis:                       2.292   Cond. No.                     5.97e+04\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 5.97e+04. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n",
      "                2.5%    97.5%       OR  p-value\n",
      "Intercept    2.4e+00  5.0e+00  3.7e+00  4.0e-08\n",
      "sex[T.Male] -3.7e-02  6.5e-01  3.1e-01  8.1e-02\n",
      "RCC_risk_QC -1.6e-01  5.4e-01  1.9e-01  3.0e-01\n",
      "age_diag    -9.5e-03  2.0e-02  5.2e-03  4.9e-01\n",
      "PC1          3.8e+02  6.0e+02  4.9e+02  5.1e-18\n",
      "PC2         -2.8e+02  2.0e+01 -1.3e+02  9.1e-02\n",
      "PC3         -5.9e+01 -3.3e+01 -4.6e+01  2.6e-12\n",
      "PC4          4.1e+00  7.8e+01  4.1e+01  3.0e-02\n",
      "PC5         -7.3e+00  8.0e+01  3.6e+01  1.0e-01\n",
      "945\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                    ASR   R-squared:                       0.320\n",
      "Model:                            OLS   Adj. R-squared:                  0.313\n",
      "Method:                 Least Squares   F-statistic:                     47.94\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           2.11e-63\n",
      "Time:                        17:19:27   Log-Likelihood:                -1894.4\n",
      "No. Observations:                 824   AIC:                             3807.\n",
      "Df Residuals:                     815   BIC:                             3849.\n",
      "Df Model:                           8                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept       3.7150      0.665      5.590      0.000       2.410       5.020\n",
      "sex[T.Male]     0.3010      0.175      1.716      0.086      -0.043       0.645\n",
      "BMI             0.0532      0.209      0.255      0.799      -0.357       0.463\n",
      "age_diag        0.0047      0.008      0.623      0.533      -0.010       0.019\n",
      "PC1           487.8704     55.174      8.842      0.000     379.571     596.170\n",
      "PC2          -130.9900     75.515     -1.735      0.083    -279.217      17.237\n",
      "PC3           -46.2043      6.524     -7.082      0.000     -59.011     -33.398\n",
      "PC4            39.9656     18.756      2.131      0.033       3.149      76.782\n",
      "PC5            35.5246     22.227      1.598      0.110      -8.104      79.153\n",
      "==============================================================================\n",
      "Omnibus:                       46.061   Durbin-Watson:                   0.799\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               19.454\n",
      "Skew:                           0.120   Prob(JB):                     5.96e-05\n",
      "Kurtosis:                       2.287   Cond. No.                     5.96e+04\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 5.96e+04. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n",
      "                2.5%    97.5%       OR  p-value\n",
      "Intercept    2.4e+00  5.0e+00  3.7e+00  3.1e-08\n",
      "sex[T.Male] -4.3e-02  6.5e-01  3.0e-01  8.6e-02\n",
      "BMI         -3.6e-01  4.6e-01  5.3e-02  8.0e-01\n",
      "age_diag    -1.0e-02  1.9e-02  4.7e-03  5.3e-01\n",
      "PC1          3.8e+02  6.0e+02  4.9e+02  5.7e-18\n",
      "PC2         -2.8e+02  1.7e+01 -1.3e+02  8.3e-02\n",
      "PC3         -5.9e+01 -3.3e+01 -4.6e+01  3.1e-12\n",
      "PC4          3.1e+00  7.7e+01  4.0e+01  3.3e-02\n",
      "PC5         -8.1e+00  7.9e+01  3.6e+01  1.1e-01\n",
      "945\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                    ASR   R-squared:                       0.320\n",
      "Model:                            OLS   Adj. R-squared:                  0.313\n",
      "Method:                 Least Squares   F-statistic:                     47.97\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           1.96e-63\n",
      "Time:                        17:19:27   Log-Likelihood:                -1894.3\n",
      "No. Observations:                 824   AIC:                             3807.\n",
      "Df Residuals:                     815   BIC:                             3849.\n",
      "Df Model:                           8                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept       3.7227      0.665      5.600      0.000       2.418       5.028\n",
      "sex[T.Male]     0.3068      0.175      1.752      0.080      -0.037       0.651\n",
      "SBP             0.1911      0.411      0.464      0.643      -0.617       0.999\n",
      "age_diag        0.0048      0.008      0.636      0.525      -0.010       0.020\n",
      "PC1           486.7971     55.197      8.819      0.000     378.453     595.141\n",
      "PC2          -130.7059     75.508     -1.731      0.084    -278.920      17.508\n",
      "PC3           -46.3118      6.516     -7.107      0.000     -59.102     -33.522\n",
      "PC4            39.9587     18.753      2.131      0.033       3.149      76.769\n",
      "PC5            35.9440     22.204      1.619      0.106      -7.639      79.527\n",
      "==============================================================================\n",
      "Omnibus:                       44.524   Durbin-Watson:                   0.799\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               19.019\n",
      "Skew:                           0.117   Prob(JB):                     7.41e-05\n",
      "Kurtosis:                       2.294   Cond. No.                     5.96e+04\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 5.96e+04. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n",
      "                2.5%    97.5%       OR  p-value\n",
      "Intercept    2.4e+00  5.0e+00  3.7e+00  2.9e-08\n",
      "sex[T.Male] -3.7e-02  6.5e-01  3.1e-01  8.0e-02\n",
      "SBP         -6.2e-01  1.0e+00  1.9e-01  6.4e-01\n",
      "age_diag    -1.0e-02  2.0e-02  4.8e-03  5.2e-01\n",
      "PC1          3.8e+02  6.0e+02  4.9e+02  6.9e-18\n",
      "PC2         -2.8e+02  1.8e+01 -1.3e+02  8.4e-02\n",
      "PC3         -5.9e+01 -3.4e+01 -4.6e+01  2.6e-12\n",
      "PC4          3.1e+00  7.7e+01  4.0e+01  3.3e-02\n",
      "PC5         -7.6e+00  8.0e+01  3.6e+01  1.1e-01\n",
      "945\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                    ASR   R-squared:                       0.320\n",
      "Model:                            OLS   Adj. R-squared:                  0.314\n",
      "Method:                 Least Squares   F-statistic:                     47.99\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           1.89e-63\n",
      "Time:                        17:19:27   Log-Likelihood:                -1894.3\n",
      "No. Observations:                 824   AIC:                             3807.\n",
      "Df Residuals:                     815   BIC:                             3849.\n",
      "Df Model:                           8                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept       3.7260      0.665      5.604      0.000       2.421       5.031\n",
      "sex[T.Male]     0.3070      0.175      1.753      0.080      -0.037       0.651\n",
      "DBP             0.1745      0.326      0.535      0.593      -0.466       0.815\n",
      "age_diag        0.0048      0.008      0.633      0.527      -0.010       0.019\n",
      "PC1           485.8791     55.264      8.792      0.000     377.402     594.356\n",
      "PC2          -132.3121     75.550     -1.751      0.080    -280.608      15.983\n",
      "PC3           -46.4159      6.520     -7.119      0.000     -59.214     -33.618\n",
      "PC4            39.7685     18.758      2.120      0.034       2.948      76.589\n",
      "PC5            35.7054     22.201      1.608      0.108      -7.872      79.283\n",
      "==============================================================================\n",
      "Omnibus:                       45.098   Durbin-Watson:                   0.798\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               19.214\n",
      "Skew:                           0.120   Prob(JB):                     6.73e-05\n",
      "Kurtosis:                       2.291   Cond. No.                     5.97e+04\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 5.97e+04. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n",
      "                2.5%    97.5%       OR  p-value\n",
      "Intercept    2.4e+00  5.0e+00  3.7e+00  2.9e-08\n",
      "sex[T.Male] -3.7e-02  6.5e-01  3.1e-01  8.0e-02\n",
      "DBP         -4.7e-01  8.1e-01  1.7e-01  5.9e-01\n",
      "age_diag    -1.0e-02  1.9e-02  4.8e-03  5.3e-01\n",
      "PC1          3.8e+02  5.9e+02  4.9e+02  8.6e-18\n",
      "PC2         -2.8e+02  1.6e+01 -1.3e+02  8.0e-02\n",
      "PC3         -5.9e+01 -3.4e+01 -4.6e+01  2.4e-12\n",
      "PC4          2.9e+00  7.7e+01  4.0e+01  3.4e-02\n",
      "PC5         -7.9e+00  7.9e+01  3.6e+01  1.1e-01\n",
      "945\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                    ASR   R-squared:                       0.324\n",
      "Model:                            OLS   Adj. R-squared:                  0.318\n",
      "Method:                 Least Squares   F-statistic:                     48.89\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           1.67e-64\n",
      "Time:                        17:19:27   Log-Likelihood:                -1891.8\n",
      "No. Observations:                 824   AIC:                             3802.\n",
      "Df Residuals:                     815   BIC:                             3844.\n",
      "Df Model:                           8                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept       3.6099      0.664      5.436      0.000       2.306       4.914\n",
      "sex[T.Male]     0.3029      0.174      1.736      0.083      -0.040       0.645\n",
      "FG_combined    -0.8187      0.358     -2.288      0.022      -1.521      -0.116\n",
      "age_diag        0.0049      0.007      0.655      0.513      -0.010       0.020\n",
      "PC1           495.1542     55.092      8.988      0.000     387.016     603.292\n",
      "PC2          -132.0172     75.278     -1.754      0.080    -279.778      15.743\n",
      "PC3           -46.5442      6.497     -7.164      0.000     -59.297     -33.792\n",
      "PC4            40.3175     18.695      2.157      0.031       3.621      77.014\n",
      "PC5            35.9503     22.133      1.624      0.105      -7.494      79.395\n",
      "==============================================================================\n",
      "Omnibus:                       48.099   Durbin-Watson:                   0.803\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               19.879\n",
      "Skew:                           0.118   Prob(JB):                     4.82e-05\n",
      "Kurtosis:                       2.277   Cond. No.                     5.96e+04\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 5.96e+04. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n",
      "                2.5%    97.5%       OR  p-value\n",
      "Intercept    2.3e+00  4.9e+00  3.6e+00  7.2e-08\n",
      "sex[T.Male] -4.0e-02  6.5e-01  3.0e-01  8.3e-02\n",
      "FG_combined -1.5e+00 -1.2e-01 -8.2e-01  2.2e-02\n",
      "age_diag    -9.8e-03  2.0e-02  4.9e-03  5.1e-01\n",
      "PC1          3.9e+02  6.0e+02  5.0e+02  1.7e-18\n",
      "PC2         -2.8e+02  1.6e+01 -1.3e+02  8.0e-02\n",
      "PC3         -5.9e+01 -3.4e+01 -4.7e+01  1.8e-12\n",
      "PC4          3.6e+00  7.7e+01  4.0e+01  3.1e-02\n",
      "PC5         -7.5e+00  7.9e+01  3.6e+01  1.0e-01\n",
      "945\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                    ASR   R-squared:                       0.320\n",
      "Model:                            OLS   Adj. R-squared:                  0.313\n",
      "Method:                 Least Squares   F-statistic:                     47.94\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           2.13e-63\n",
      "Time:                        17:19:27   Log-Likelihood:                -1894.4\n",
      "No. Observations:                 824   AIC:                             3807.\n",
      "Df Residuals:                     815   BIC:                             3849.\n",
      "Df Model:                           8                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept       3.7122      0.665      5.584      0.000       2.407       5.017\n",
      "sex[T.Male]     0.3027      0.175      1.729      0.084      -0.041       0.646\n",
      "FI_combined    -0.4203      1.882     -0.223      0.823      -4.114       3.273\n",
      "age_diag        0.0047      0.008      0.630      0.529      -0.010       0.019\n",
      "PC1           488.1453     55.208      8.842      0.000     379.778     596.513\n",
      "PC2          -130.4709     75.539     -1.727      0.085    -278.746      17.804\n",
      "PC3           -46.1684      6.538     -7.062      0.000     -59.001     -33.335\n",
      "PC4            40.0623     18.754      2.136      0.033       3.250      76.875\n",
      "PC5            35.8462     22.205      1.614      0.107      -7.740      79.432\n",
      "==============================================================================\n",
      "Omnibus:                       44.036   Durbin-Watson:                   0.800\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               18.926\n",
      "Skew:                           0.118   Prob(JB):                     7.77e-05\n",
      "Kurtosis:                       2.296   Cond. No.                     5.97e+04\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 5.97e+04. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n",
      "                2.5%    97.5%       OR  p-value\n",
      "Intercept    2.4e+00  5.0e+00  3.7e+00  3.2e-08\n",
      "sex[T.Male] -4.1e-02  6.5e-01  3.0e-01  8.4e-02\n",
      "FI_combined -4.1e+00  3.3e+00 -4.2e-01  8.2e-01\n",
      "age_diag    -1.0e-02  1.9e-02  4.7e-03  5.3e-01\n",
      "PC1          3.8e+02  6.0e+02  4.9e+02  5.7e-18\n",
      "PC2         -2.8e+02  1.8e+01 -1.3e+02  8.5e-02\n",
      "PC3         -5.9e+01 -3.3e+01 -4.6e+01  3.5e-12\n",
      "PC4          3.2e+00  7.7e+01  4.0e+01  3.3e-02\n",
      "PC5         -7.7e+00  7.9e+01  3.6e+01  1.1e-01\n",
      "945\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                    ASR   R-squared:                       0.320\n",
      "Model:                            OLS   Adj. R-squared:                  0.314\n",
      "Method:                 Least Squares   F-statistic:                     48.01\n",
      "Date:                Tue, 06 Feb 2024   Prob (F-statistic):           1.79e-63\n",
      "Time:                        17:19:27   Log-Likelihood:                -1894.2\n",
      "No. Observations:                 824   AIC:                             3806.\n",
      "Df Residuals:                     815   BIC:                             3849.\n",
      "Df Model:                           8                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "===============================================================================\n",
      "                  coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept       3.6775      0.667      5.513      0.000       2.368       4.987\n",
      "sex[T.Male]     0.3059      0.175      1.748      0.081      -0.038       0.649\n",
      "SmkInit         0.2869      0.454      0.632      0.527      -0.604       1.177\n",
      "age_diag        0.0046      0.008      0.618      0.537      -0.010       0.019\n",
      "PC1           491.1963     55.437      8.860      0.000     382.381     600.012\n",
      "PC2          -130.3386     75.504     -1.726      0.085    -278.544      17.866\n",
      "PC3           -46.2071      6.516     -7.091      0.000     -58.998     -33.417\n",
      "PC4            39.6750     18.759      2.115      0.035       2.853      76.497\n",
      "PC5            35.0235     22.231      1.575      0.116      -8.614      78.661\n",
      "==============================================================================\n",
      "Omnibus:                       46.621   Durbin-Watson:                   0.799\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               19.573\n",
      "Skew:                           0.120   Prob(JB):                     5.62e-05\n",
      "Kurtosis:                       2.284   Cond. No.                     5.97e+04\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 5.97e+04. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n",
      "                2.5%    97.5%       OR  p-value\n",
      "Intercept    2.4e+00  5.0e+00  3.7e+00  4.7e-08\n",
      "sex[T.Male] -3.8e-02  6.5e-01  3.1e-01  8.1e-02\n",
      "SmkInit     -6.0e-01  1.2e+00  2.9e-01  5.3e-01\n",
      "age_diag    -1.0e-02  1.9e-02  4.6e-03  5.4e-01\n",
      "PC1          3.8e+02  6.0e+02  4.9e+02  4.9e-18\n",
      "PC2         -2.8e+02  1.8e+01 -1.3e+02  8.5e-02\n",
      "PC3         -5.9e+01 -3.3e+01 -4.6e+01  2.9e-12\n",
      "PC4          2.9e+00  7.6e+01  4.0e+01  3.5e-02\n",
      "PC5         -8.6e+00  7.9e+01  3.5e+01  1.2e-01\n"
     ]
    }
   ],
   "source": [
    "# PRS regressions\n",
    "dataframe_to_regress_locally = dataframe_to_regress.copy()\n",
    "# Add PCA components\n",
    "PCA_data = pd.read_csv('./germline_calls/PCA_AllccRCC_1000G.eigenvec', index_col=1, sep=' ')\n",
    "# rename PDids in index from b to a\n",
    "PCA_data.index = PCA_data.index.str.replace('b2', 'a')\n",
    "PCA_data.index = PCA_data.index.str.replace('b', 'a')\n",
    "PCA_data = PCA_data.query(\"ID == '0'\")\n",
    "PCA_data = PCA_data[['PC1','PC2','PC3','PC4','PC5']]\n",
    "if 'PC1' not in dataframe_to_regress_locally.columns:\n",
    "    dataframe_to_regress_locally = pd.merge(PCA_data, dataframe_to_regress_locally, left_index=True, right_index=True)\n",
    "qgrid.show_grid(merged_sigs_with_metadata, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})\n",
    "ASR_variable = 'ASR'\n",
    "PRS_results_dataframe = pd.DataFrame(index = list(labels.values()), columns=['2.5%', '97.5%', 'OR', 'p-value'])\n",
    "for PRS_variable in list(labels.keys()):\n",
    "    # insert PRS for ccRCC risk\n",
    "    PRS = pd.read_csv('./PRS/%s_0.1_Final.all_score' % PRS_variable,index_col=1, sep = ' ')\n",
    "    PRS_mean = PRS['Pt_1'].mean()\n",
    "    PRS['Pt_1'] = PRS['Pt_1'] - PRS_mean\n",
    "    PRS.index = PRS.index.str.replace(r'b', 'a')\n",
    "    PRS.drop(['FID'],axis=1,inplace=True)\n",
    "    PRS['Pt_1'] = pd.to_numeric(PRS['Pt_1'])\n",
    "    PRS.rename(columns={'Pt_1':PRS_variable},inplace=True)\n",
    "    if not PRS_variable in dataframe_to_regress_locally.columns:\n",
    "        dataframe_to_regress_locally = pd.concat([PRS, dataframe_to_regress_locally], axis=1)#.fillna(0)\n",
    "    # sex = 'Male'\n",
    "    sex = 'All'\n",
    "    # drop Thai and Japanese cases\n",
    "    dataframe_to_regress_locally = dataframe_to_regress_locally[dataframe_to_regress_locally.country != 'Thailand'].copy()\n",
    "    dataframe_to_regress_locally = dataframe_to_regress_locally[dataframe_to_regress_locally.country != 'Japan'].copy()\n",
    "    if sex != 'All':\n",
    "        dataframe_to_regress_locally = dataframe_to_regress_locally[dataframe_to_regress_locally['sex']==sex].copy()\n",
    "    print(len(dataframe_to_regress_locally))\n",
    "    dataframe_to_regress_locally = dataframe_to_regress_locally.dropna(subset=[ASR_variable])\n",
    "    dataframe_to_regress_locally[ASR_variable] = pd.to_numeric(dataframe_to_regress_locally[ASR_variable])\n",
    "    # linear regression\n",
    "    dataframe_to_regress_locally = dataframe_to_regress_locally[[PRS_variable] + [ASR_variable, 'age_diag', 'country', 'sex', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']] # 'stage', 'bmi_q', 'hypert', 'county', 'age_diag', 'bmi_q', 'diabetes', 'rec_period', 'tobacco_ever', 'tobacco_ever', 'hypert', 'diabetes']]\n",
    "    # dataframe_to_regress_locally = dataframe_to_regress_locally.replace('missing', np.nan)\n",
    "    # dataframe_to_regress_locally = dataframe_to_regress_locally.replace('Missing', np.nan)\n",
    "    # print(dataframe_to_regress)\n",
    "    mod = smf.ols(formula='%s ~ %s + sex + age_diag + PC1 + PC2 + PC3 + PC4 + PC5 ' % (ASR_variable, PRS_variable), data=dataframe_to_regress_locally)\n",
    "    linear_fit = mod.fit()\n",
    "    print(linear_fit.summary())\n",
    "    params = linear_fit.params\n",
    "    # ORs with confidence intervals\n",
    "    conf = linear_fit.conf_int()\n",
    "    conf['OR'] = params\n",
    "    conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "#     conf = np.exp(conf)\n",
    "    pvalues_dataframe = linear_fit.pvalues.to_frame()\n",
    "    conf['p-value'] = pvalues_dataframe.copy()\n",
    "    print(conf)\n",
    "    PRS_results_dataframe.loc[labels[PRS_variable],:] = conf.loc[PRS_variable]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                         Effect size (95% CI)  p-value\n",
      "RCC risk                    0.19 (-0.16-0.54)  3.0e-01\n",
      "BMI                         0.05 (-0.36-0.46)  8.0e-01\n",
      "Systolic blood pressure     0.19 (-0.62-1.00)  6.4e-01\n",
      "Diastolic blood pressure    0.17 (-0.47-0.81)  5.9e-01\n",
      "Fasting glucose           -0.82 (-1.52--0.12)  2.2e-02\n",
      "Fasting insulin            -0.42 (-4.11-3.27)  8.2e-01\n",
      "Tobacco smoking             0.29 (-0.60-1.18)  5.3e-01\n"
     ]
    }
   ],
   "source": [
    "# printing out PRS results tables\n",
    "results_dataframe_to_print = PRS_results_dataframe.copy()\n",
    "results_dataframe_to_print['2.5%'] = pd.to_numeric(results_dataframe_to_print['2.5%'], errors='coerce')\n",
    "results_dataframe_to_print['97.5%'] = pd.to_numeric(results_dataframe_to_print['97.5%'], errors='coerce')\n",
    "results_dataframe_to_print['OR'] = pd.to_numeric(results_dataframe_to_print['OR'], errors='coerce')\n",
    "pd.options.display.float_format = '{:.1e}'.format\n",
    "if len(results_dataframe_to_print.index)>0:\n",
    "    results_dataframe_to_print = results_dataframe_to_print.apply(lambda x: [f'{y:.2f}' for y in x] if ('p-value' not in x.name) else x)\n",
    "    results_dataframe_to_print['p-value'] = results_dataframe_to_print['p-value'].astype(float)\n",
    "#     results_dataframe_to_print['p-value (corr)'] = results_dataframe_to_print['p-value (corr)'].astype(float)\n",
    "    results_dataframe_to_print['Effect size (95% CI)'] = results_dataframe_to_print['OR'] + ' (' + results_dataframe_to_print['2.5%'] + '-' + results_dataframe_to_print['97.5%'] + ')'\n",
    "    results_dataframe_to_print.drop(['2.5%','97.5%','OR'], axis=1, inplace=True)\n",
    "    columns = list(sorted(results_dataframe_to_print.columns.values))\n",
    "    results_dataframe_to_print = results_dataframe_to_print[columns]\n",
    "    results_dataframe_to_print['Effect size (95% CI)'] = results_dataframe_to_print['Effect size (95% CI)'].replace({'nan (nan-nan)':'-'})\n",
    "for mutation_type in mutation_types:\n",
    "    results_dataframe_to_print.rename(index={mutation_type + '_burden':mutation_type + ' burden'},inplace=True)\n",
    "results_dataframe_to_print.to_csv('./incidence/ASR_tables/PRS_results_vs_ASR_results.csv', float_format='%.1e')\n",
    "print(results_dataframe_to_print)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                Effect size (95% CI) Regression type  p-value  p-value (corr)\n",
      "Signature                                                                    \n",
      "SBS1              -1.28 (-7.57-5.00)          linear  6.9e-01         1.0e+00\n",
      "SBS2                1.15 (0.87-1.52)        logistic  3.2e-01         1.0e+00\n",
      "SBS4                1.06 (1.01-1.10)        logistic  8.3e-03         2.4e-01\n",
      "SBS5                1.01 (0.94-1.09)        logistic  7.5e-01         1.0e+00\n",
      "SBS12               0.82 (0.75-0.90)        logistic  6.4e-05         1.8e-03\n",
      "SBS13               1.06 (1.00-1.12)        logistic  5.4e-02         1.0e+00\n",
      "SBS18               1.03 (0.95-1.11)        logistic  4.7e-01         1.0e+00\n",
      "SBS22a              0.72 (0.66-0.78)        logistic  7.4e-14         2.1e-12\n",
      "SBS22b              0.73 (0.68-0.79)        logistic  1.5e-15         4.2e-14\n",
      "SBS40a           28.60 (10.29-46.92)          linear  2.2e-03         6.5e-02\n",
      "SBS40b           62.24 (48.40-76.08)          linear  5.1e-18         1.5e-16\n",
      "SBS40c              0.99 (0.93-1.05)        logistic  7.7e-01         1.0e+00\n",
      "SBS44               0.83 (0.64-1.07)        logistic  1.5e-01         1.0e+00\n",
      "DBS2                1.03 (0.99-1.07)        logistic  1.8e-01         1.0e+00\n",
      "DBS4                1.11 (1.03-1.19)        logistic  4.4e-03         1.3e-01\n",
      "DBS9                1.06 (0.95-1.18)        logistic  3.3e-01         1.0e+00\n",
      "DBS20               0.85 (0.78-0.92)        logistic  7.9e-05         2.3e-03\n",
      "DBS_C               1.02 (0.96-1.09)        logistic  5.1e-01         1.0e+00\n",
      "ID1               -1.29 (-5.09-2.52)          linear  5.1e-01         1.0e+00\n",
      "ID2                 1.07 (0.91-1.26)        logistic  4.4e-01         1.0e+00\n",
      "ID3                 0.93 (0.84-1.02)        logistic  1.2e-01         1.0e+00\n",
      "ID5              14.61 (10.20-19.03)          linear  1.3e-10         3.9e-09\n",
      "ID8                 1.10 (1.05-1.16)        logistic  7.1e-05         2.1e-03\n",
      "ID11                1.17 (0.92-1.50)        logistic  2.0e-01         1.0e+00\n",
      "ID12                0.81 (0.63-1.04)        logistic  1.0e-01         1.0e+00\n",
      "ID23                0.77 (0.66-0.90)        logistic  1.1e-03         3.1e-02\n",
      "SBS burden  -102.07 (-173.09--31.06)          linear  4.9e-03         1.4e-01\n",
      "DBS burden          0.65 (0.05-1.24)          linear  3.3e-02         9.5e-01\n",
      "ID burden         9.32 (-5.81-24.45)          linear  2.3e-01         1.0e+00\n"
     ]
    }
   ],
   "source": [
    "# printing out the main results tables\n",
    "results_dataframe_to_print = results_dataframe.copy()\n",
    "results_dataframe_to_print['2.5%'] = pd.to_numeric(results_dataframe_to_print['2.5%'], errors='coerce')\n",
    "results_dataframe_to_print['97.5%'] = pd.to_numeric(results_dataframe_to_print['97.5%'], errors='coerce')\n",
    "results_dataframe_to_print['OR'] = pd.to_numeric(results_dataframe_to_print['OR'], errors='coerce')\n",
    "pd.options.display.float_format = '{:.1e}'.format\n",
    "if len(results_dataframe_to_print.index)>0:\n",
    "    results_dataframe_to_print = results_dataframe_to_print.apply(lambda x: [f'{y:.2f}' for y in x] if ('p-value' not in x.name and x.name!='Regression type') else x)\n",
    "    results_dataframe_to_print['p-value'] = results_dataframe_to_print['p-value'].astype(float)\n",
    "    results_dataframe_to_print['p-value (corr)'] = results_dataframe_to_print['p-value (corr)'].astype(float)\n",
    "    results_dataframe_to_print['Effect size (95% CI)'] = results_dataframe_to_print['OR'] + ' (' + results_dataframe_to_print['2.5%'] + '-' + results_dataframe_to_print['97.5%'] + ')'\n",
    "    results_dataframe_to_print.drop(['2.5%','97.5%','OR'], axis=1, inplace=True)\n",
    "    columns = list(sorted(results_dataframe_to_print.columns.values))\n",
    "    results_dataframe_to_print = results_dataframe_to_print[columns]\n",
    "    results_dataframe_to_print['Effect size (95% CI)'] = results_dataframe_to_print['Effect size (95% CI)'].replace({'nan (nan-nan)':'-'})\n",
    "for mutation_type in mutation_types:\n",
    "    results_dataframe_to_print.rename(index={mutation_type + '_burden':mutation_type + ' burden'},inplace=True)\n",
    "\n",
    "if analysis=='COSMIC' and not 'rand' in run_name:\n",
    "    results_dataframe_to_print.rename(index={'SBS1536A':'SBS40b','SBS1536B':'SBS40a','SBS1536F':'SBS40c','SBS22':'SBS22a','SBS1536I':'SBS22b','DBS78D':'DBS20','ID83C':'ID23'},inplace=True)\n",
    "    if 'SBS44' in results_dataframe_to_print.index:\n",
    "        results_dataframe_to_print.loc['SBS44', :], results_dataframe_to_print.loc['SBS22b', :] = results_dataframe_to_print.loc['SBS22b'].copy(), results_dataframe_to_print.loc['SBS44'].copy()\n",
    "        results_dataframe_to_print = results_dataframe_to_print.rename({'SBS44': 'SBS22b', 'SBS22b': 'SBS44'})\n",
    "    if 'SBS40a' in results_dataframe_to_print.index:\n",
    "        results_dataframe_to_print.loc['SBS40a', :], results_dataframe_to_print.loc['SBS40b', :] = results_dataframe_to_print.loc['SBS40b'].copy(), results_dataframe_to_print.loc['SBS40a'].copy()\n",
    "        results_dataframe_to_print = results_dataframe_to_print.rename({'SBS40a': 'SBS40b', 'SBS40b': 'SBS40a'}, axis='rows')\n",
    "    if 'DBS20' in results_dataframe_to_print.index:\n",
    "        results_dataframe_to_print.loc['DBS20', :], results_dataframe_to_print.loc['DBS78C', :] = results_dataframe_to_print.loc['DBS78C'].copy(), results_dataframe_to_print.loc['DBS20'].copy()\n",
    "        results_dataframe_to_print = results_dataframe_to_print.rename({'DBS20': 'DBS78C', 'DBS78C': 'DBS20'}, axis='rows')\n",
    "results_dataframe_to_print.rename(index={'DBS78A':'DBS_A','DBS78B':'DBS_B','DBS78C':'DBS_C','DBS78D':'DBS_D'},inplace=True)\n",
    "results_dataframe_to_print.rename(index={'ID83A':'ID_A','ID83B':'ID_B','ID83C':'ID_C','ID83D':'ID_D','ID83E':'ID_E','ID83F':'ID_F','ID83G':'ID_G'},inplace=True)\n",
    "results_dataframe_to_print.rename(index={'SBS1536A':'SBS_A','SBS1536B':'SBS_B','SBS1536C':'SBS_C','SBS1536D':'SBS_D','SBS1536E':'SBS_E','SBS1536F':'SBS_F','SBS1536G':'SBS_G','SBS1536H':'SBS_H','SBS1536I':'SBS_I','SBS1536J':'SBS_J','SBS1536K':'SBS_K','SBS1536L':'SBS_L','SBS1536M':'SBS_M'},inplace=True)\n",
    "results_dataframe_to_print.to_csv('./incidence/ASR_tables/' + run_name + '_' + analysis + '_results.csv', float_format='%.1e')\n",
    "print(results_dataframe_to_print)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
